C API for speaker diarization (#1402)

This commit is contained in:
Fangjun Kuang
2024-10-09 17:10:03 +08:00
committed by GitHub
parent 8535b1d3bb
commit d468527f62
9 changed files with 418 additions and 7 deletions

View File

@@ -927,7 +927,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
// Create an offline TTS object from `config`.
// Free the returned pointer with SherpaOnnxDestroyOfflineTts().
SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
const SherpaOnnxOfflineTtsConfig *config);
// Free the pointer returned by SherpaOnnxCreateOfflineTts()
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
// Return the sample rate of the current TTS object
@@ -954,6 +954,11 @@ SherpaOnnxOfflineTtsGenerateWithCallback(
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
SherpaOnnxGeneratedAudioCallback callback);
const SherpaOnnxGeneratedAudio *
SherpaOnnxOfflineTtsGenerateWithProgressCallback(
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
SherpaOnnxGeneratedAudioProgressCallback callback);
// Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional
// `void* arg` to the callback.
SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *
@@ -1384,6 +1389,115 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
// Check whether `filename` exists.
//
// Return 1 if the file exists; return 0 if the file does not exist.
SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
// =========================================================================
// For offline speaker diarization (i.e., non-streaming speaker diarization)
// =========================================================================
// Configuration of a pyannote speaker segmentation model.
SHERPA_ONNX_API typedef struct
SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig {
// Presumably the path to the model file -- TODO confirm against the
// implementation.
const char *model;
} SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
// Configuration of the speaker segmentation model used by offline speaker
// diarization.
//
// NOTE(review): the trailing comments on the fields below look like the
// default/typical values -- confirm against the implementation.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerSegmentationModelConfig {
// Settings of the pyannote segmentation model.
SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig pyannote;
int32_t num_threads; // 1
int32_t debug; // false, i.e., 0
const char *provider; // "cpu"
} SherpaOnnxOfflineSpeakerSegmentationModelConfig;
// Configuration of the clustering step. Either the number of clusters or a
// distance threshold controls how many clusters are produced.
SHERPA_ONNX_API typedef struct SherpaOnnxFastClusteringConfig {
// Number of clusters.
//
// If greater than 0, then threshold is ignored.
//
// We strongly recommend that you set it if you know the number of clusters
// in advance.
int32_t num_clusters;
// Distance threshold. Used only when num_clusters is not greater than 0.
//
// The smaller, the more clusters it will generate.
// The larger, the fewer clusters it will generate.
float threshold;
} SherpaOnnxFastClusteringConfig;
// Top-level configuration passed to
// SherpaOnnxCreateOfflineSpeakerDiarization().
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationConfig {
// Speaker segmentation model settings.
SherpaOnnxOfflineSpeakerSegmentationModelConfig segmentation;
// Speaker embedding extractor settings.
SherpaOnnxSpeakerEmbeddingExtractorConfig embedding;
// Clustering settings.
SherpaOnnxFastClusteringConfig clustering;
// If a segment is shorter than this value, then it is discarded.
float min_duration_on; // in seconds
// If the gap between two segments of the same speaker is less than this
// value, then these two segments are merged into a single segment.
// We do this recursively.
float min_duration_off; // in seconds
} SherpaOnnxOfflineSpeakerDiarizationConfig;
// Handle type of an offline speaker diarization object. It is only
// forward-declared here and is always used through a pointer obtained from
// SherpaOnnxCreateOfflineSpeakerDiarization().
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarization
SherpaOnnxOfflineSpeakerDiarization;
// Create an offline speaker diarization object from `config`.
//
// The user has to invoke SherpaOnnxDestroyOfflineSpeakerDiarization()
// to free the returned pointer to avoid a memory leak.
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarization *
SherpaOnnxCreateOfflineSpeakerDiarization(
const SherpaOnnxOfflineSpeakerDiarizationConfig *config);
// Free the pointer returned by SherpaOnnxCreateOfflineSpeakerDiarization().
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineSpeakerDiarization(
const SherpaOnnxOfflineSpeakerDiarization *sd);
// Return the expected sample rate of the input audio samples for `sd`.
SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
const SherpaOnnxOfflineSpeakerDiarization *sd);
// Handle type of a diarization result. It is only forward-declared here and
// is always used through a pointer returned by
// SherpaOnnxOfflineSpeakerDiarizationProcess() or
// SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback().
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationResult
SherpaOnnxOfflineSpeakerDiarizationResult;
// One segment of a diarization result.
//
// NOTE(review): `start` and `end` are presumably in seconds, like
// min_duration_on/min_duration_off in the config, and `speaker` is presumably
// an integer speaker index -- confirm against the implementation.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationSegment {
float start;
float end;
int32_t speaker;
} SherpaOnnxOfflineSpeakerDiarizationSegment;
// Return the number of speakers in the result `r`.
SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers(
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
// Return the number of segments in the result `r`. This is also the number
// of entries in the array returned by
// SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime().
SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
// Return the segments of `r` as an array (ordered by start time, as the
// function name indicates).
//
// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroySegment()
// to free the returned pointer to avoid a memory leak.
//
// The returned pointer is the start address of an array.
// The number of entries in the array equals the value
// returned by SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments().
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationSegment *
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
// Free the array returned by
// SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime().
SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(
const SherpaOnnxOfflineSpeakerDiarizationSegment *s);
// Progress callback type accepted by
// SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback().
//
// It receives the number of chunks processed so far, the total number of
// chunks, and a user-supplied pointer `arg`.
//
// NOTE(review): the meaning of the int32_t return value is not visible in
// this header section -- confirm against the implementation.
typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)(
int32_t num_processed_chunk, int32_t num_total_chunks, void *arg);
// Run speaker diarization on the given audio samples.
//
// `samples` points to the input audio; `n` is presumably the number of
// samples in it (TODO confirm). The expected sample rate can be queried with
// SherpaOnnxOfflineSpeakerDiarizationGetSampleRate().
//
// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult()
// to free the returned pointer to avoid a memory leak.
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult *
SherpaOnnxOfflineSpeakerDiarizationProcess(
const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
int32_t n);
// Same parameters as SherpaOnnxOfflineSpeakerDiarizationProcess(), plus a
// progress `callback` and a user pointer `arg`.
//
// NOTE(review): `arg` is presumably forwarded unchanged as the last argument
// of `callback` -- confirm against the implementation.
//
// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult()
// to free the returned pointer to avoid a memory leak.
SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult *
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback,
void *arg);
// Free the pointer returned by SherpaOnnxOfflineSpeakerDiarizationProcess()
// or SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback().
SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(
const SherpaOnnxOfflineSpeakerDiarizationResult *r);
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif