C API for speaker diarization (#1402)
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -120,3 +120,4 @@ vits-melo-tts-zh_en
|
|||||||
sherpa-onnx-online-punct-en-2024-08-06
|
sherpa-onnx-online-punct-en-2024-08-06
|
||||||
*.mp4
|
*.mp4
|
||||||
*.mp3
|
*.mp3
|
||||||
|
sherpa-onnx-pyannote-segmentation-3-0
|
||||||
|
|||||||
11
README.md
11
README.md
@@ -1,8 +1,12 @@
|
|||||||
### Supported functions
|
### Supported functions
|
||||||
|
|
||||||
|Speech recognition| Speech synthesis | Speaker verification | Speaker identification |
|
|Speech recognition| Speech synthesis |
|
||||||
|------------------|------------------|----------------------|------------------------|
|
|------------------|------------------|
|
||||||
| ✔️ | ✔️ | ✔️ | ✔️ |
|
| ✔️ | ✔️ |
|
||||||
|
|
||||||
|
|Speaker identification| Speaker diarization | Speaker verification |
|
||||||
|
|----------------------|-------------------- |------------------------|
|
||||||
|
| ✔️ | ✔️ | ✔️ |
|
||||||
|
|
||||||
| Spoken Language identification | Audio tagging | Voice activity detection |
|
| Spoken Language identification | Audio tagging | Voice activity detection |
|
||||||
|--------------------------------|---------------|--------------------------|
|
|--------------------------------|---------------|--------------------------|
|
||||||
@@ -47,6 +51,7 @@ This repository supports running the following functions **locally**
|
|||||||
|
|
||||||
- Speech-to-text (i.e., ASR); both streaming and non-streaming are supported
|
- Speech-to-text (i.e., ASR); both streaming and non-streaming are supported
|
||||||
- Text-to-speech (i.e., TTS)
|
- Text-to-speech (i.e., TTS)
|
||||||
|
- Speaker diarization
|
||||||
- Speaker identification
|
- Speaker identification
|
||||||
- Speaker verification
|
- Speaker verification
|
||||||
- Spoken language identification
|
- Spoken language identification
|
||||||
|
|||||||
@@ -9,6 +9,11 @@ if(SHERPA_ONNX_ENABLE_TTS)
|
|||||||
target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
|
target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION)
|
||||||
|
add_executable(offline-speaker-diarization-c-api offline-speaker-diarization-c-api.c)
|
||||||
|
target_link_libraries(offline-speaker-diarization-c-api sherpa-onnx-c-api)
|
||||||
|
endif()
|
||||||
|
|
||||||
add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
|
add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
|
||||||
target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
|
target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
|
||||||
|
|
||||||
|
|||||||
131
c-api-examples/offline-speaker-diarization-c-api.c
Normal file
131
c-api-examples/offline-speaker-diarization-c-api.c
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
// c-api-examples/offline-speaker-diarization-c-api.c
|
||||||
|
//
|
||||||
|
// Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
|
||||||
|
//
|
||||||
|
// This file demonstrates how to implement speaker diarization with
|
||||||
|
// sherpa-onnx's C API.
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
/*
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
Step 1: Download a speaker segmentation model
|
||||||
|
|
||||||
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
||||||
|
for a list of available models. The following is an example
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
Step 2: Download a speaker embedding extractor model
|
||||||
|
|
||||||
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
||||||
|
for a list of available models. The following is an example
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
Step 3: Download test wave files
|
||||||
|
|
||||||
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
||||||
|
for a list of available test wave files. The following is an example
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
Step 4: Run it
|
||||||
|
|
||||||
|
*/
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/c-api/c-api.h"
|
||||||
|
|
||||||
|
// Progress callback handed to
// SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback().
//
// Prints the percentage of processed chunks to stderr.
//
// @param num_processed_chunks Number of chunks processed so far.
// @param num_total_chunks     Total number of chunks to process.
// @param arg                  User pointer supplied by the caller; unused.
// @return Always 0. The return value is currently ignored.
static int32_t ProgressCallback(int32_t num_processed_chunks,
                                int32_t num_total_chunks, void *arg) {
  (void)arg;  // not needed by this example

  // Guard the division: with a non-positive total there is nothing to
  // report, so print 0% instead of dividing by zero.
  float progress = 0.0f;
  if (num_total_chunks > 0) {
    progress = (float)(100.0 * num_processed_chunks / num_total_chunks);
  }

  fprintf(stderr, "progress %.2f%%\n", progress);

  // the return value is currently ignored
  return 0;
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
// Please see the comments at the start of this file for how to download
|
||||||
|
// the .onnx file and .wav files below
|
||||||
|
const char *segmentation_model =
|
||||||
|
"./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
|
||||||
|
|
||||||
|
const char *embedding_extractor_model =
|
||||||
|
"./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
|
||||||
|
|
||||||
|
const char *wav_filename = "./0-four-speakers-zh.wav";
|
||||||
|
|
||||||
|
const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
|
||||||
|
if (wave == NULL) {
|
||||||
|
fprintf(stderr, "Failed to read %s\n", wav_filename);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationConfig config;
|
||||||
|
memset(&config, 0, sizeof(config));
|
||||||
|
|
||||||
|
config.segmentation.pyannote.model = segmentation_model;
|
||||||
|
config.embedding.model = embedding_extractor_model;
|
||||||
|
|
||||||
|
// the test wave ./0-four-speakers-zh.wav has 4 speakers, so
|
||||||
|
// we set num_clusters to 4
|
||||||
|
//
|
||||||
|
config.clustering.num_clusters = 4;
|
||||||
|
// If you don't know the number of speakers in the test wave file, please
|
||||||
|
// use
|
||||||
|
// config.clustering.threshold = 0.5; // You need to tune this threshold
|
||||||
|
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd =
|
||||||
|
SherpaOnnxCreateOfflineSpeakerDiarization(&config);
|
||||||
|
|
||||||
|
if (!sd) {
|
||||||
|
fprintf(stderr, "Failed to initialize offline speaker diarization\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd) !=
|
||||||
|
wave->sample_rate) {
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"Expected sample rate: %d. Actual sample rate from the wave file: %d\n",
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd),
|
||||||
|
wave->sample_rate);
|
||||||
|
goto failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *result =
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
|
||||||
|
sd, wave->samples, wave->num_samples, ProgressCallback, NULL);
|
||||||
|
if (!result) {
|
||||||
|
fprintf(stderr, "Failed to do speaker diarization");
|
||||||
|
goto failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t num_segments =
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result);
|
||||||
|
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationSegment *segments =
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result);
|
||||||
|
|
||||||
|
for (int32_t i = 0; i != num_segments; ++i) {
|
||||||
|
fprintf(stderr, "%.3f -- %.3f speaker_%02d\n", segments[i].start,
|
||||||
|
segments[i].end, segments[i].speaker);
|
||||||
|
}
|
||||||
|
|
||||||
|
failed:
|
||||||
|
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result);
|
||||||
|
SherpaOnnxDestroyOfflineSpeakerDiarization(sd);
|
||||||
|
SherpaOnnxFreeWave(wave);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@@ -31,6 +31,10 @@
|
|||||||
#include "sherpa-onnx/csrc/offline-tts.h"
|
#include "sherpa-onnx/csrc/offline-tts.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION == 1
|
||||||
|
#include "sherpa-onnx/csrc/offline-speaker-diarization.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
struct SherpaOnnxOnlineRecognizer {
|
struct SherpaOnnxOnlineRecognizer {
|
||||||
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
|
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
|
||||||
};
|
};
|
||||||
@@ -1670,3 +1674,144 @@ void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
|
|||||||
int32_t SherpaOnnxFileExists(const char *filename) {
|
int32_t SherpaOnnxFileExists(const char *filename) {
|
||||||
return sherpa_onnx::FileExists(filename);
|
return sherpa_onnx::FileExists(filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION == 1
|
||||||
|
|
||||||
|
struct SherpaOnnxOfflineSpeakerDiarization {
|
||||||
|
std::unique_ptr<sherpa_onnx::OfflineSpeakerDiarization> impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SherpaOnnxOfflineSpeakerDiarizationResult {
|
||||||
|
sherpa_onnx::OfflineSpeakerDiarizationResult impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Creates an offline speaker diarization object from a C config struct.
//
// Each field of `config` is copied into the C++ config via SHERPA_ONNX_OR,
// which supplies the listed fallback (macro defined elsewhere; presumably
// used when the field is zero/NULL):
//   - model paths: ""
//   - num_threads: 1
//   - provider: "cpu" (also used when the provider string is empty)
//   - clustering.num_clusters: -1
//   - clustering.threshold: 0.5
//   - min_duration_on: 0.3, min_duration_off: 0.5
//
// @param config Pointer to the configuration. Must outlive this call only.
// @return A newly allocated handle that must be released with
//         SherpaOnnxDestroyOfflineSpeakerDiarization(), or nullptr if
//         `config` is NULL or the config fails validation.
const SherpaOnnxOfflineSpeakerDiarization *
SherpaOnnxCreateOfflineSpeakerDiarization(
    const SherpaOnnxOfflineSpeakerDiarizationConfig *config) {
  // Reject a NULL config instead of dereferencing it below.
  if (!config) {
    SHERPA_ONNX_LOGE("config is a null pointer");
    return nullptr;
  }

  sherpa_onnx::OfflineSpeakerDiarizationConfig sd_config;

  // Speaker segmentation model
  sd_config.segmentation.pyannote.model =
      SHERPA_ONNX_OR(config->segmentation.pyannote.model, "");
  sd_config.segmentation.num_threads =
      SHERPA_ONNX_OR(config->segmentation.num_threads, 1);
  sd_config.segmentation.debug = config->segmentation.debug;
  sd_config.segmentation.provider =
      SHERPA_ONNX_OR(config->segmentation.provider, "cpu");
  if (sd_config.segmentation.provider.empty()) {
    // A non-NULL but empty provider string also means "use the default".
    sd_config.segmentation.provider = "cpu";
  }

  // Speaker embedding extractor model
  sd_config.embedding.model = SHERPA_ONNX_OR(config->embedding.model, "");
  sd_config.embedding.num_threads =
      SHERPA_ONNX_OR(config->embedding.num_threads, 1);
  sd_config.embedding.debug = config->embedding.debug;
  sd_config.embedding.provider =
      SHERPA_ONNX_OR(config->embedding.provider, "cpu");
  if (sd_config.embedding.provider.empty()) {
    sd_config.embedding.provider = "cpu";
  }

  // Clustering
  sd_config.clustering.num_clusters =
      SHERPA_ONNX_OR(config->clustering.num_clusters, -1);

  sd_config.clustering.threshold =
      SHERPA_ONNX_OR(config->clustering.threshold, 0.5);

  // Post-processing of segments
  sd_config.min_duration_on = SHERPA_ONNX_OR(config->min_duration_on, 0.3);

  sd_config.min_duration_off = SHERPA_ONNX_OR(config->min_duration_off, 0.5);

  if (!sd_config.Validate()) {
    SHERPA_ONNX_LOGE("Errors in config");
    return nullptr;
  }

  SherpaOnnxOfflineSpeakerDiarization *sd =
      new SherpaOnnxOfflineSpeakerDiarization;

  sd->impl =
      std::make_unique<sherpa_onnx::OfflineSpeakerDiarization>(sd_config);

  // Dump the effective config when either model asks for debug output.
  if (sd_config.segmentation.debug || sd_config.embedding.debug) {
    SHERPA_ONNX_LOGE("%s\n", sd_config.ToString().c_str());
  }

  return sd;
}
|
||||||
|
|
||||||
|
void SherpaOnnxDestroyOfflineSpeakerDiarization(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd) {
|
||||||
|
delete sd;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd) {
|
||||||
|
return sd->impl->SampleRate();
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
|
||||||
|
return r->impl.NumSpeakers();
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
|
||||||
|
return r->impl.NumSegments();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies the segments of `r`, as returned by its SortByStartTime(), into a
// newly allocated C array.
//
// @param r A valid, non-null result handle.
// @return nullptr when the result has no segments; otherwise an array
//         allocated with new[] that the caller must release with
//         SherpaOnnxOfflineSpeakerDiarizationDestroySegment().
const SherpaOnnxOfflineSpeakerDiarizationSegment *
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(
    const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
  if (r->impl.NumSegments() == 0) {
    return nullptr;
  }

  auto sorted = r->impl.SortByStartTime();
  int32_t count = sorted.size();

  auto *out = new SherpaOnnxOfflineSpeakerDiarizationSegment[count];

  // Field-by-field copy from the C++ segment into the plain C struct.
  for (int32_t k = 0; k < count; ++k) {
    const auto &seg = sorted[k];
    auto &dst = out[k];

    dst.start = seg.Start();
    dst.end = seg.End();
    dst.speaker = seg.Speaker();
  }

  return out;
}
|
||||||
|
|
||||||
|
void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationSegment *s) {
|
||||||
|
delete[] s;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs diarization on `n` samples and returns the result in a newly
// allocated handle.
//
// @param sd      A valid, non-null diarization handle.
// @param samples Pointer to the audio samples.
// @param n       Number of samples.
// @return A handle the caller must release with
//         SherpaOnnxOfflineSpeakerDiarizationDestroyResult().
const SherpaOnnxOfflineSpeakerDiarizationResult *
SherpaOnnxOfflineSpeakerDiarizationProcess(
    const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
    int32_t n) {
  auto *result = new SherpaOnnxOfflineSpeakerDiarizationResult;
  result->impl = sd->impl->Process(samples, n);
  return result;
}
|
||||||
|
|
||||||
|
void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
|
||||||
|
delete r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same as SherpaOnnxOfflineSpeakerDiarizationProcess(), but forwards
// `callback` and `arg` to the underlying Process() call.
//
// @param sd       A valid, non-null diarization handle.
// @param samples  Pointer to the audio samples.
// @param n        Number of samples.
// @param callback Progress callback forwarded to Process().
// @param arg      User pointer forwarded to Process() with the callback.
// @return A handle the caller must release with
//         SherpaOnnxOfflineSpeakerDiarizationDestroyResult().
const SherpaOnnxOfflineSpeakerDiarizationResult *
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
    const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
    int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback,
    void *arg) {
  auto *result = new SherpaOnnxOfflineSpeakerDiarizationResult;
  result->impl = sd->impl->Process(samples, n, callback, arg);
  return result;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|||||||
@@ -927,7 +927,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
|
|||||||
SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
|
SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
|
||||||
const SherpaOnnxOfflineTtsConfig *config);
|
const SherpaOnnxOfflineTtsConfig *config);
|
||||||
|
|
||||||
// Free the pointer returned by CreateOfflineTts()
|
// Free the pointer returned by SherpaOnnxCreateOfflineTts()
|
||||||
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
|
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
|
||||||
|
|
||||||
// Return the sample rate of the current TTS object
|
// Return the sample rate of the current TTS object
|
||||||
@@ -954,6 +954,11 @@ SherpaOnnxOfflineTtsGenerateWithCallback(
|
|||||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||||
SherpaOnnxGeneratedAudioCallback callback);
|
SherpaOnnxGeneratedAudioCallback callback);
|
||||||
|
|
||||||
|
const SherpaOnnxGeneratedAudio *
|
||||||
|
SherpaOnnxOfflineTtsGenerateWithProgressCallback(
|
||||||
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||||
|
SherpaOnnxGeneratedAudioProgressCallback callback);
|
||||||
|
|
||||||
// Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional
|
// Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional
|
||||||
// `void* arg` to the callback.
|
// `void* arg` to the callback.
|
||||||
SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *
|
SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *
|
||||||
@@ -1384,6 +1389,115 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
|
|||||||
// Return 1 if the file exists; return 0 if the file does not exist.
|
// Return 1 if the file exists; return 0 if the file does not exist.
|
||||||
SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
|
SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
|
||||||
|
|
||||||
|
// =========================================================================
|
||||||
|
// For offline speaker diarization (i.e., non-streaming speaker diarization)
|
||||||
|
// =========================================================================
|
||||||
|
// Configuration of the pyannote speaker segmentation model.
SHERPA_ONNX_API typedef struct
    SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig {
  // Path to the segmentation model file
  const char *model;
} SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerSegmentationModelConfig {
|
||||||
|
SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig pyannote;
|
||||||
|
int32_t num_threads; // 1
|
||||||
|
int32_t debug; // false
|
||||||
|
const char *provider; // "cpu"
|
||||||
|
} SherpaOnnxOfflineSpeakerSegmentationModelConfig;
|
||||||
|
|
||||||
|
// Configuration of the clustering step.
SHERPA_ONNX_API typedef struct SherpaOnnxFastClusteringConfig {
  // Number of clusters. When it is greater than 0, `threshold` is ignored.
  //
  // Setting it is strongly recommended whenever the number of clusters is
  // known in advance.
  int32_t num_clusters;

  // Distance threshold.
  //
  // A smaller value produces more clusters;
  // a larger value produces fewer clusters.
  float threshold;
} SherpaOnnxFastClusteringConfig;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationConfig {
|
||||||
|
SherpaOnnxOfflineSpeakerSegmentationModelConfig segmentation;
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorConfig embedding;
|
||||||
|
SherpaOnnxFastClusteringConfig clustering;
|
||||||
|
|
||||||
|
// if a segment is less than this value, then it is discarded
|
||||||
|
float min_duration_on; // in seconds
|
||||||
|
|
||||||
|
// if the gap between to segments of the same speaker is less than this value,
|
||||||
|
// then these two segments are merged into a single segment.
|
||||||
|
// We do this recursively.
|
||||||
|
float min_duration_off; // in seconds
|
||||||
|
} SherpaOnnxOfflineSpeakerDiarizationConfig;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarization
|
||||||
|
SherpaOnnxOfflineSpeakerDiarization;
|
||||||
|
|
||||||
|
// The user has to invoke SherpaOnnxDestroyOfflineSpeakerDiarization()
|
||||||
|
// to free the returned pointer to avoid memory leak
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarization *
|
||||||
|
SherpaOnnxCreateOfflineSpeakerDiarization(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationConfig *config);
|
||||||
|
|
||||||
|
// Free the pointer returned by SherpaOnnxCreateOfflineSpeakerDiarization()
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineSpeakerDiarization(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd);
|
||||||
|
|
||||||
|
// Expected sample rate of the input audio samples
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationResult
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationResult;
|
||||||
|
|
||||||
|
// One diarization segment: a time span attributed to a single speaker.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationSegment {
  float start;      // start of the segment
  float end;        // end of the segment
  int32_t speaker;  // index of the speaker
} SherpaOnnxOfflineSpeakerDiarizationSegment;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
|
||||||
|
|
||||||
|
// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroySegment()
|
||||||
|
// to free the returned pointer to avoid memory leak.
|
||||||
|
//
|
||||||
|
// The returned pointer is the start address of an array.
|
||||||
|
// The number of entries in the array equals the value
|
||||||
|
// returned by SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments()
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationSegment *
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationSegment *s);
|
||||||
|
|
||||||
|
typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)(
|
||||||
|
int32_t num_processed_chunk, int32_t num_total_chunks, void *arg);
|
||||||
|
|
||||||
|
// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult()
|
||||||
|
// to free the returned pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult *
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationProcess(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
|
||||||
|
int32_t n);
|
||||||
|
|
||||||
|
// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult()
|
||||||
|
// to free the returned pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult *
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
|
||||||
|
int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback,
|
||||||
|
void *arg);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(
|
||||||
|
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
|
||||||
|
|
||||||
#if defined(__GNUC__)
|
#if defined(__GNUC__)
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -20,8 +20,8 @@ struct FastClusteringConfig {
|
|||||||
|
|
||||||
// distance threshold.
|
// distance threshold.
|
||||||
//
|
//
|
||||||
// The lower, the more clusters it will generate.
|
// The smaller, the more clusters it will generate.
|
||||||
// The higher, the fewer clusters it will generate.
|
// The larger, the fewer clusters it will generate.
|
||||||
float threshold = 0.5;
|
float threshold = 0.5;
|
||||||
|
|
||||||
FastClusteringConfig() = default;
|
FastClusteringConfig() = default;
|
||||||
|
|||||||
@@ -43,6 +43,16 @@ bool OfflineSpeakerDiarizationConfig::Validate() const {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (min_duration_on < 0) {
|
||||||
|
SHERPA_ONNX_LOGE("min_duration_on %.3f is negative", min_duration_on);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (min_duration_off < 0) {
|
||||||
|
SHERPA_ONNX_LOGE("min_duration_off %.3f is negative", min_duration_off);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
#include "sherpa-onnx/csrc/wave-reader.h"
|
#include "sherpa-onnx/csrc/wave-reader.h"
|
||||||
|
|
||||||
static int32_t ProgressCallback(int32_t processed_chunks, int32_t num_chunks,
|
static int32_t ProgressCallback(int32_t processed_chunks, int32_t num_chunks,
|
||||||
void *arg) {
|
void *) {
|
||||||
float progress = 100.0 * processed_chunks / num_chunks;
|
float progress = 100.0 * processed_chunks / num_chunks;
|
||||||
fprintf(stderr, "progress %.2f%%\n", progress);
|
fprintf(stderr, "progress %.2f%%\n", progress);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user