Add C API for speaker embedding extractor. (#711)
This commit is contained in:
15
.github/scripts/test-c-api.sh
vendored
15
.github/scripts/test-c-api.sh
vendored
@@ -1,6 +1,6 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
set -e
|
set -ex
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
# This function is from espnet
|
# This function is from espnet
|
||||||
@@ -9,6 +9,7 @@ log() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
echo "SLID_EXE is $SLID_EXE"
|
echo "SLID_EXE is $SLID_EXE"
|
||||||
|
echo "SID_EXE is $SID_EXE"
|
||||||
echo "PATH: $PATH"
|
echo "PATH: $PATH"
|
||||||
|
|
||||||
|
|
||||||
@@ -24,3 +25,15 @@ rm sherpa-onnx-whisper-tiny.tar.bz2
|
|||||||
$SLID_EXE
|
$SLID_EXE
|
||||||
|
|
||||||
rm -rf sherpa-onnx-whisper-tiny*
|
rm -rf sherpa-onnx-whisper-tiny*
|
||||||
|
|
||||||
|
log "------------------------------------------------------------"
|
||||||
|
log "Download file for speaker identification and verification "
|
||||||
|
log "------------------------------------------------------------"
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
|
||||||
|
git clone https://github.com/csukuangfj/sr-data
|
||||||
|
|
||||||
|
$SID_EXE
|
||||||
|
|
||||||
|
rm -fv *.onnx
|
||||||
|
rm -rf sr-data
|
||||||
|
|||||||
3
.github/workflows/linux.yaml
vendored
3
.github/workflows/linux.yaml
vendored
@@ -124,11 +124,12 @@ jobs:
|
|||||||
name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
|
name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
|
||||||
path: build/bin/*
|
path: build/bin/*
|
||||||
|
|
||||||
- name: Test spoken language identification (C API)
|
- name: Test C API
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
export PATH=$PWD/build/bin:$PATH
|
export PATH=$PWD/build/bin:$PATH
|
||||||
export SLID_EXE=spoken-language-identification-c-api
|
export SLID_EXE=spoken-language-identification-c-api
|
||||||
|
export SID_EXE=speaker-identification-c-api
|
||||||
|
|
||||||
.github/scripts/test-c-api.sh
|
.github/scripts/test-c-api.sh
|
||||||
|
|
||||||
|
|||||||
3
.github/workflows/macos.yaml
vendored
3
.github/workflows/macos.yaml
vendored
@@ -103,11 +103,12 @@ jobs:
|
|||||||
otool -L build/bin/sherpa-onnx
|
otool -L build/bin/sherpa-onnx
|
||||||
otool -l build/bin/sherpa-onnx
|
otool -l build/bin/sherpa-onnx
|
||||||
|
|
||||||
- name: Test spoken language identification (C API)
|
- name: Test C API
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
export PATH=$PWD/build/bin:$PATH
|
export PATH=$PWD/build/bin:$PATH
|
||||||
export SLID_EXE=spoken-language-identification-c-api
|
export SLID_EXE=spoken-language-identification-c-api
|
||||||
|
export SID_EXE=speaker-identification-c-api
|
||||||
|
|
||||||
.github/scripts/test-c-api.sh
|
.github/scripts/test-c-api.sh
|
||||||
|
|
||||||
|
|||||||
3
.github/workflows/windows-x64.yaml
vendored
3
.github/workflows/windows-x64.yaml
vendored
@@ -70,11 +70,12 @@ jobs:
|
|||||||
|
|
||||||
ls -lh ./bin/Release/sherpa-onnx.exe
|
ls -lh ./bin/Release/sherpa-onnx.exe
|
||||||
|
|
||||||
- name: Test spoken language identification (C API)
|
- name: Test C API
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
export PATH=$PWD/build/bin/Release:$PATH
|
export PATH=$PWD/build/bin/Release:$PATH
|
||||||
export SLID_EXE=spoken-language-identification-c-api.exe
|
export SLID_EXE=spoken-language-identification-c-api.exe
|
||||||
|
export SID_EXE=speaker-identification-c-api.exe
|
||||||
|
|
||||||
.github/scripts/test-c-api.sh
|
.github/scripts/test-c-api.sh
|
||||||
|
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -87,3 +87,4 @@ vits-coqui-*
|
|||||||
vits-mms-*
|
vits-mms-*
|
||||||
*.tar.bz2
|
*.tar.bz2
|
||||||
sherpa-onnx-paraformer-trilingual-zh-cantonese-en
|
sherpa-onnx-paraformer-trilingual-zh-cantonese-en
|
||||||
|
sr-data
|
||||||
|
|||||||
@@ -12,6 +12,9 @@ endif()
|
|||||||
add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
|
add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
|
||||||
target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
|
target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
|
||||||
|
|
||||||
|
add_executable(speaker-identification-c-api speaker-identification-c-api.c)
|
||||||
|
target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
|
||||||
|
|
||||||
if(SHERPA_ONNX_HAS_ALSA)
|
if(SHERPA_ONNX_HAS_ALSA)
|
||||||
add_subdirectory(./asr-microphone-example)
|
add_subdirectory(./asr-microphone-example)
|
||||||
elseif((UNIX AND NOT APPLE) OR LINUX)
|
elseif((UNIX AND NOT APPLE) OR LINUX)
|
||||||
|
|||||||
@@ -188,10 +188,11 @@ int32_t main(int32_t argc, char *argv[]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config);
|
const SherpaOnnxOnlineRecognizer *recognizer =
|
||||||
SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
|
CreateOnlineRecognizer(&config);
|
||||||
|
const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
|
||||||
|
|
||||||
SherpaOnnxDisplay *display = CreateDisplay(50);
|
const SherpaOnnxDisplay *display = CreateDisplay(50);
|
||||||
int32_t segment_id = 0;
|
int32_t segment_id = 0;
|
||||||
|
|
||||||
const char *device_name = argv[context.index];
|
const char *device_name = argv[context.index];
|
||||||
|
|||||||
@@ -162,10 +162,11 @@ int32_t main(int32_t argc, char *argv[]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config);
|
const SherpaOnnxOnlineRecognizer *recognizer =
|
||||||
SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
|
CreateOnlineRecognizer(&config);
|
||||||
|
const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
|
||||||
|
|
||||||
SherpaOnnxDisplay *display = CreateDisplay(50);
|
const SherpaOnnxDisplay *display = CreateDisplay(50);
|
||||||
int32_t segment_id = 0;
|
int32_t segment_id = 0;
|
||||||
|
|
||||||
const char *wav_filename = argv[context.index];
|
const char *wav_filename = argv[context.index];
|
||||||
|
|||||||
256
c-api-examples/speaker-identification-c-api.c
Normal file
256
c-api-examples/speaker-identification-c-api.c
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
// c-api-examples/speaker-identification-c-api.c
|
||||||
|
//
|
||||||
|
// Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
|
||||||
|
// We assume you have pre-downloaded the speaker embedding extractor model
|
||||||
|
// from
|
||||||
|
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
||||||
|
//
|
||||||
|
// An example command to download
|
||||||
|
// "3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"
|
||||||
|
// is given below:
|
||||||
|
//
|
||||||
|
// clang-format off
|
||||||
|
//
|
||||||
|
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
|
||||||
|
//
|
||||||
|
// clang-format on
|
||||||
|
//
|
||||||
|
// Also, please download the test wave files from
|
||||||
|
//
|
||||||
|
// https://github.com/csukuangfj/sr-data
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/c-api/c-api.h"
|
||||||
|
|
||||||
|
static const float *ComputeEmbedding(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *ex, const char *wav_filename) {
|
||||||
|
const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
|
||||||
|
if (wave == NULL) {
|
||||||
|
fprintf(stderr, "Failed to read %s\n", wav_filename);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const SherpaOnnxOnlineStream *stream =
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorCreateStream(ex);
|
||||||
|
|
||||||
|
AcceptWaveform(stream, wave->sample_rate, wave->samples, wave->num_samples);
|
||||||
|
InputFinished(stream);
|
||||||
|
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingExtractorIsReady(ex, stream)) {
|
||||||
|
fprintf(stderr, "The input wave file %s is too short!\n", wav_filename);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// we will free `v` outside of this function
|
||||||
|
const float *v =
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream);
|
||||||
|
|
||||||
|
DestroyOnlineStream(stream);
|
||||||
|
SherpaOnnxFreeWave(wave);
|
||||||
|
|
||||||
|
// Remeber to free v to avoid memory leak
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t main() {
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorConfig config;
|
||||||
|
|
||||||
|
memset(&config, 0, sizeof(config));
|
||||||
|
|
||||||
|
// please download the model from
|
||||||
|
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
||||||
|
config.model = "./3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx";
|
||||||
|
|
||||||
|
config.num_threads = 1;
|
||||||
|
config.debug = 0;
|
||||||
|
config.provider = "cpu";
|
||||||
|
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *ex =
|
||||||
|
SherpaOnnxCreateSpeakerEmbeddingExtractor(&config);
|
||||||
|
if (!ex) {
|
||||||
|
fprintf(stderr, "Failed to create speaker embedding extractor");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex);
|
||||||
|
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *manager =
|
||||||
|
SherpaOnnxCreateSpeakerEmbeddingManager(dim);
|
||||||
|
|
||||||
|
// Please download the test data from
|
||||||
|
// https://github.com/csukuangfj/sr-data
|
||||||
|
const char *spk1_1 = "./sr-data/enroll/fangjun-sr-1.wav";
|
||||||
|
const char *spk1_2 = "./sr-data/enroll/fangjun-sr-2.wav";
|
||||||
|
const char *spk1_3 = "./sr-data/enroll/fangjun-sr-3.wav";
|
||||||
|
|
||||||
|
const char *spk2_1 = "./sr-data/enroll/leijun-sr-1.wav";
|
||||||
|
const char *spk2_2 = "./sr-data/enroll/leijun-sr-2.wav";
|
||||||
|
|
||||||
|
const float *spk1_vec[4] = {NULL};
|
||||||
|
spk1_vec[0] = ComputeEmbedding(ex, spk1_1);
|
||||||
|
spk1_vec[1] = ComputeEmbedding(ex, spk1_2);
|
||||||
|
spk1_vec[2] = ComputeEmbedding(ex, spk1_3);
|
||||||
|
|
||||||
|
const float *spk2_vec[3] = {NULL};
|
||||||
|
spk2_vec[0] = ComputeEmbedding(ex, spk2_1);
|
||||||
|
spk2_vec[1] = ComputeEmbedding(ex, spk2_2);
|
||||||
|
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "fangjun", spk1_vec)) {
|
||||||
|
fprintf(stderr, "Failed to register fangjun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "fangjun")) {
|
||||||
|
fprintf(stderr, "Failed to find fangjun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "leijun", spk2_vec)) {
|
||||||
|
fprintf(stderr, "Failed to register leijun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "leijun")) {
|
||||||
|
fprintf(stderr, "Failed to find leijun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 2) {
|
||||||
|
fprintf(stderr, "There should be two speakers: fangjun and leijun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *const *all_speakers =
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager);
|
||||||
|
const char *const *p = all_speakers;
|
||||||
|
fprintf(stderr, "list of registered speakers\n-----\n");
|
||||||
|
while (p[0]) {
|
||||||
|
fprintf(stderr, "speaker: %s\n", p[0]);
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "----\n");
|
||||||
|
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers);
|
||||||
|
|
||||||
|
const char *test1 = "./sr-data/test/fangjun-test-sr-1.wav";
|
||||||
|
const char *test2 = "./sr-data/test/leijun-test-sr-1.wav";
|
||||||
|
const char *test3 = "./sr-data/test/liudehua-test-sr-1.wav";
|
||||||
|
|
||||||
|
const float *v1 = ComputeEmbedding(ex, test1);
|
||||||
|
const float *v2 = ComputeEmbedding(ex, test2);
|
||||||
|
const float *v3 = ComputeEmbedding(ex, test3);
|
||||||
|
|
||||||
|
float threshold = 0.6;
|
||||||
|
|
||||||
|
const char *name1 =
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold);
|
||||||
|
if (name1) {
|
||||||
|
fprintf(stderr, "%s: Found %s\n", test1, name1);
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: Not found\n", test1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *name2 =
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold);
|
||||||
|
if (name2) {
|
||||||
|
fprintf(stderr, "%s: Found %s\n", test2, name2);
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: Not found\n", test2);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *name3 =
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v3, threshold);
|
||||||
|
if (name3) {
|
||||||
|
fprintf(stderr, "%s: Found %s\n", test3, name3);
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name3);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: Not found\n", test3);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v1,
|
||||||
|
threshold);
|
||||||
|
if (ok) {
|
||||||
|
fprintf(stderr, "%s matches fangjun\n", test1);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s does NOT match fangjun\n", test1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v2,
|
||||||
|
threshold);
|
||||||
|
if (ok) {
|
||||||
|
fprintf(stderr, "%s matches fangjun\n", test2);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s does NOT match fangjun\n", test2);
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "Removing fangjun\n");
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "fangjun")) {
|
||||||
|
fprintf(stderr, "Failed to remove fangjun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 1) {
|
||||||
|
fprintf(stderr, "There should be only 1 speaker left\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
name1 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold);
|
||||||
|
if (name1) {
|
||||||
|
fprintf(stderr, "%s: Found %s\n", test1, name1);
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: Not found\n", test1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "Removing leijun\n");
|
||||||
|
if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "leijun")) {
|
||||||
|
fprintf(stderr, "Failed to remove leijun\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 0) {
|
||||||
|
fprintf(stderr, "There should be only 1 speaker left\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
name2 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold);
|
||||||
|
if (name2) {
|
||||||
|
fprintf(stderr, "%s: Found %s\n", test2, name2);
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: Not found\n", test2);
|
||||||
|
}
|
||||||
|
|
||||||
|
all_speakers = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager);
|
||||||
|
|
||||||
|
p = all_speakers;
|
||||||
|
fprintf(stderr, "list of registered speakers\n-----\n");
|
||||||
|
while (p[0]) {
|
||||||
|
fprintf(stderr, "speaker: %s\n", p[0]);
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "----\n");
|
||||||
|
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers);
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v1);
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v2);
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v3);
|
||||||
|
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[0]);
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[1]);
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[2]);
|
||||||
|
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[0]);
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[1]);
|
||||||
|
|
||||||
|
SherpaOnnxDestroySpeakerEmbeddingManager(manager);
|
||||||
|
SherpaOnnxDestroySpeakerEmbeddingExtractor(ex);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@@ -1,3 +1,6 @@
|
|||||||
|
// c-api-examples/spoken-language-identification-c-api.c
|
||||||
|
//
|
||||||
|
// Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
|
||||||
// We assume you have pre-downloaded the whisper multi-lingual models
|
// We assume you have pre-downloaded the whisper multi-lingual models
|
||||||
// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ class ViewController: UIViewController {
|
|||||||
// Please select one model that is best suitable for you.
|
// Please select one model that is best suitable for you.
|
||||||
//
|
//
|
||||||
// You can also modify Model.swift to add new pre-trained models from
|
// You can also modify Model.swift to add new pre-trained models from
|
||||||
// https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
|
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||||
|
|
||||||
// let modelConfig = getBilingualStreamZhEnZipformer20230220()
|
// let modelConfig = getBilingualStreamZhEnZipformer20230220()
|
||||||
// let modelConfig = getZhZipformer20230615()
|
// let modelConfig = getZhZipformer20230615()
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
//
|
//
|
||||||
// Created by fangjun on 2023/11/23.
|
// Created by fangjun on 2023/11/23.
|
||||||
//
|
//
|
||||||
// Speech-to-text with Next-gen Kaldi on iOS without Internet connection
|
// Text-to-speech with Next-gen Kaldi on iOS without Internet connection
|
||||||
|
|
||||||
import SwiftUI
|
import SwiftUI
|
||||||
import AVFoundation
|
import AVFoundation
|
||||||
|
|||||||
@@ -183,7 +183,7 @@ event = threading.Event()
|
|||||||
first_message_time = None
|
first_message_time = None
|
||||||
|
|
||||||
|
|
||||||
def generated_audio_callback(samples: np.ndarray):
|
def generated_audio_callback(samples: np.ndarray, progress: float):
|
||||||
"""This function is called whenever max_num_sentences sentences
|
"""This function is called whenever max_num_sentences sentences
|
||||||
have been processed.
|
have been processed.
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,8 @@
|
|||||||
#include "sherpa-onnx/csrc/macros.h"
|
#include "sherpa-onnx/csrc/macros.h"
|
||||||
#include "sherpa-onnx/csrc/offline-recognizer.h"
|
#include "sherpa-onnx/csrc/offline-recognizer.h"
|
||||||
#include "sherpa-onnx/csrc/online-recognizer.h"
|
#include "sherpa-onnx/csrc/online-recognizer.h"
|
||||||
|
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
|
||||||
|
#include "sherpa-onnx/csrc/speaker-embedding-manager.h"
|
||||||
#include "sherpa-onnx/csrc/spoken-language-identification.h"
|
#include "sherpa-onnx/csrc/spoken-language-identification.h"
|
||||||
#include "sherpa-onnx/csrc/voice-activity-detector.h"
|
#include "sherpa-onnx/csrc/voice-activity-detector.h"
|
||||||
#include "sherpa-onnx/csrc/wave-reader.h"
|
#include "sherpa-onnx/csrc/wave-reader.h"
|
||||||
@@ -114,7 +116,7 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
|
|||||||
return recognizer;
|
return recognizer;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DestroyOnlineRecognizer(SherpaOnnxOnlineRecognizer *recognizer) {
|
void DestroyOnlineRecognizer(const SherpaOnnxOnlineRecognizer *recognizer) {
|
||||||
delete recognizer;
|
delete recognizer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -132,25 +134,28 @@ SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords(
|
|||||||
return stream;
|
return stream;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DestroyOnlineStream(SherpaOnnxOnlineStream *stream) { delete stream; }
|
void DestroyOnlineStream(const SherpaOnnxOnlineStream *stream) {
|
||||||
|
delete stream;
|
||||||
|
}
|
||||||
|
|
||||||
void AcceptWaveform(SherpaOnnxOnlineStream *stream, int32_t sample_rate,
|
void AcceptWaveform(const SherpaOnnxOnlineStream *stream, int32_t sample_rate,
|
||||||
const float *samples, int32_t n) {
|
const float *samples, int32_t n) {
|
||||||
stream->impl->AcceptWaveform(sample_rate, samples, n);
|
stream->impl->AcceptWaveform(sample_rate, samples, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t IsOnlineStreamReady(SherpaOnnxOnlineRecognizer *recognizer,
|
int32_t IsOnlineStreamReady(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream *stream) {
|
const SherpaOnnxOnlineStream *stream) {
|
||||||
return recognizer->impl->IsReady(stream->impl.get());
|
return recognizer->impl->IsReady(stream->impl.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer,
|
void DecodeOnlineStream(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream *stream) {
|
const SherpaOnnxOnlineStream *stream) {
|
||||||
recognizer->impl->DecodeStream(stream->impl.get());
|
recognizer->impl->DecodeStream(stream->impl.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
|
void DecodeMultipleOnlineStreams(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream **streams, int32_t n) {
|
const SherpaOnnxOnlineStream **streams,
|
||||||
|
int32_t n) {
|
||||||
std::vector<sherpa_onnx::OnlineStream *> ss(n);
|
std::vector<sherpa_onnx::OnlineStream *> ss(n);
|
||||||
for (int32_t i = 0; i != n; ++i) {
|
for (int32_t i = 0; i != n; ++i) {
|
||||||
ss[i] = streams[i]->impl.get();
|
ss[i] = streams[i]->impl.get();
|
||||||
@@ -159,7 +164,8 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
|
|||||||
}
|
}
|
||||||
|
|
||||||
const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
||||||
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
|
const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
|
const SherpaOnnxOnlineStream *stream) {
|
||||||
sherpa_onnx::OnlineRecognizerResult result =
|
sherpa_onnx::OnlineRecognizerResult result =
|
||||||
recognizer->impl->GetResult(stream->impl.get());
|
recognizer->impl->GetResult(stream->impl.get());
|
||||||
const auto &text = result.text;
|
const auto &text = result.text;
|
||||||
@@ -232,29 +238,30 @@ void DestroyOnlineRecognizerResult(const SherpaOnnxOnlineRecognizerResult *r) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Reset(SherpaOnnxOnlineRecognizer *recognizer,
|
void Reset(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream *stream) {
|
const SherpaOnnxOnlineStream *stream) {
|
||||||
recognizer->impl->Reset(stream->impl.get());
|
recognizer->impl->Reset(stream->impl.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
void InputFinished(SherpaOnnxOnlineStream *stream) {
|
void InputFinished(const SherpaOnnxOnlineStream *stream) {
|
||||||
stream->impl->InputFinished();
|
stream->impl->InputFinished();
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t IsEndpoint(SherpaOnnxOnlineRecognizer *recognizer,
|
int32_t IsEndpoint(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream *stream) {
|
const SherpaOnnxOnlineStream *stream) {
|
||||||
return recognizer->impl->IsEndpoint(stream->impl.get());
|
return recognizer->impl->IsEndpoint(stream->impl.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line) {
|
const SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line) {
|
||||||
SherpaOnnxDisplay *ans = new SherpaOnnxDisplay;
|
SherpaOnnxDisplay *ans = new SherpaOnnxDisplay;
|
||||||
ans->impl = std::make_unique<sherpa_onnx::Display>(max_word_per_line);
|
ans->impl = std::make_unique<sherpa_onnx::Display>(max_word_per_line);
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DestroyDisplay(SherpaOnnxDisplay *display) { delete display; }
|
void DestroyDisplay(const SherpaOnnxDisplay *display) { delete display; }
|
||||||
|
|
||||||
void SherpaOnnxPrint(SherpaOnnxDisplay *display, int32_t idx, const char *s) {
|
void SherpaOnnxPrint(const SherpaOnnxDisplay *display, int32_t idx,
|
||||||
|
const char *s) {
|
||||||
display->impl->Print(idx, s);
|
display->impl->Print(idx, s);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -808,9 +815,8 @@ int32_t SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal(
|
static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal(
|
||||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||||
float speed, std::function<void(const float *, int32_t, float)> callback)
|
std::function<void(const float *, int32_t, float)> callback) {
|
||||||
{
|
|
||||||
sherpa_onnx::GeneratedAudio audio =
|
sherpa_onnx::GeneratedAudio audio =
|
||||||
tts->impl->Generate(text, sid, speed, callback);
|
tts->impl->Generate(text, sid, speed, callback);
|
||||||
|
|
||||||
@@ -833,36 +839,37 @@ static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal(
|
|||||||
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
|
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
|
||||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
|
||||||
float speed) {
|
float speed) {
|
||||||
return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, nullptr );
|
return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback(
|
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback(
|
||||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||||
SherpaOnnxGeneratedAudioCallback callback) {
|
SherpaOnnxGeneratedAudioCallback callback) {
|
||||||
auto wrapper = [callback](const float *samples, int32_t n, float /*progress*/) {
|
auto wrapper = [callback](const float *samples, int32_t n,
|
||||||
callback(samples, n );
|
float /*progress*/) { callback(samples, n); };
|
||||||
};
|
|
||||||
|
|
||||||
return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper );
|
return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithProgressCallback(
|
const SherpaOnnxGeneratedAudio *
|
||||||
|
SherpaOnnxOfflineTtsGenerateWithProgressCallback(
|
||||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||||
SherpaOnnxGeneratedAudioProgressCallback callback) {
|
SherpaOnnxGeneratedAudioProgressCallback callback) {
|
||||||
auto wrapper = [callback](const float *samples, int32_t n, float progress) {
|
auto wrapper = [callback](const float *samples, int32_t n, float progress) {
|
||||||
callback(samples, n, progress );
|
callback(samples, n, progress);
|
||||||
};
|
};
|
||||||
return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper );
|
return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(
|
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(
|
||||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||||
SherpaOnnxGeneratedAudioCallbackWithArg callback, void *arg) {
|
SherpaOnnxGeneratedAudioCallbackWithArg callback, void *arg) {
|
||||||
auto wrapper = [callback, arg](const float *samples, int32_t n, float /*progress*/) {
|
auto wrapper = [callback, arg](const float *samples, int32_t n,
|
||||||
|
float /*progress*/) {
|
||||||
callback(samples, n, arg);
|
callback(samples, n, arg);
|
||||||
};
|
};
|
||||||
|
|
||||||
return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper );
|
return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
|
void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
|
||||||
@@ -972,3 +979,200 @@ void SherpaOnnxDestroySpokenLanguageIdentificationResult(
|
|||||||
delete r;
|
delete r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct SherpaOnnxSpeakerEmbeddingExtractor {
|
||||||
|
std::unique_ptr<sherpa_onnx::SpeakerEmbeddingExtractor> impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Creates a speaker embedding extractor from a C config.
//
// Unset fields (null pointers / zeros) fall back to defaults: an empty model
// path, 1 thread, debug off and the "cpu" provider.
//
// @param config Configuration; must not be null.
// @return A new handle that must be released with
//         SherpaOnnxDestroySpeakerEmbeddingExtractor(), or nullptr when
//         `config` is null or fails validation.
const SherpaOnnxSpeakerEmbeddingExtractor *
SherpaOnnxCreateSpeakerEmbeddingExtractor(
    const SherpaOnnxSpeakerEmbeddingExtractorConfig *config) {
  if (config == nullptr) {
    SHERPA_ONNX_LOGE("config is a null pointer!");
    return nullptr;
  }

  sherpa_onnx::SpeakerEmbeddingExtractorConfig c;
  c.model = SHERPA_ONNX_OR(config->model, "");

  c.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
  c.debug = SHERPA_ONNX_OR(config->debug, 0);
  c.provider = SHERPA_ONNX_OR(config->provider, "cpu");

  if (config->debug) {
    SHERPA_ONNX_LOGE("%s\n", c.ToString().c_str());
  }

  if (!c.Validate()) {
    SHERPA_ONNX_LOGE("Errors in config!");
    return nullptr;
  }

  // Hold the wrapper in a unique_ptr while the implementation is built so
  // that it is not leaked should the construction throw.
  auto p = std::make_unique<SherpaOnnxSpeakerEmbeddingExtractor>();
  p->impl = std::make_unique<sherpa_onnx::SpeakerEmbeddingExtractor>(c);

  // Ownership passes to the caller of the C API.
  return p.release();
}
|
||||||
|
|
||||||
|
void SherpaOnnxDestroySpeakerEmbeddingExtractor(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p) {
|
||||||
|
delete p;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingExtractorDim(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p) {
|
||||||
|
return p->impl->Dim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates a new stream for feeding audio to the extractor `p`.
// The returned stream is heap-allocated and owned by the caller.
const SherpaOnnxOnlineStream *SherpaOnnxSpeakerEmbeddingExtractorCreateStream(
    const SherpaOnnxSpeakerEmbeddingExtractor *p) {
  return new SherpaOnnxOnlineStream(p->impl->CreateStream());
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingExtractorIsReady(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p,
|
||||||
|
const SherpaOnnxOnlineStream *s) {
|
||||||
|
return p->impl->IsReady(s->impl.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
const float *SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p,
|
||||||
|
const SherpaOnnxOnlineStream *s) {
|
||||||
|
std::vector<float> v = p->impl->Compute(s->impl.get());
|
||||||
|
float *ans = new float[v.size()];
|
||||||
|
std::copy(v.begin(), v.end(), ans);
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Frees an embedding array that was allocated with new[].
// Passing a null pointer is a no-op.
void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(const float *v) {
  if (v == nullptr) {
    return;
  }

  delete[] v;
}
|
||||||
|
|
||||||
|
struct SherpaOnnxSpeakerEmbeddingManager {
|
||||||
|
std::unique_ptr<sherpa_onnx::SpeakerEmbeddingManager> impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Allocate a manager handle wrapping a C++ SpeakerEmbeddingManager.
//
// @param dim Forwarded to the SpeakerEmbeddingManager constructor.
// @return A handle allocated with `new`; release it with
//         SherpaOnnxDestroySpeakerEmbeddingManager().
const SherpaOnnxSpeakerEmbeddingManager *
SherpaOnnxCreateSpeakerEmbeddingManager(int32_t dim) {
  auto *manager = new SherpaOnnxSpeakerEmbeddingManager;
  manager->impl = std::make_unique<sherpa_onnx::SpeakerEmbeddingManager>(dim);
  return manager;
}
|
||||||
|
|
||||||
|
void SherpaOnnxDestroySpeakerEmbeddingManager(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p) {
|
||||||
|
delete p;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerAdd(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float *v) {
|
||||||
|
return p->impl->Add(name, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerAddList(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float **v) {
|
||||||
|
int32_t n = 0;
|
||||||
|
auto q = v;
|
||||||
|
while (q && q[0]) {
|
||||||
|
++n;
|
||||||
|
++q;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n == 0) {
|
||||||
|
SHERPA_ONNX_LOGE("Empty embedding!");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<float>> vec(n);
|
||||||
|
int32_t dim = p->impl->Dim();
|
||||||
|
|
||||||
|
for (int32_t i = 0; i != n; ++i) {
|
||||||
|
vec[i] = std::vector<float>(v[i], v[i] + dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
return p->impl->Add(name, vec);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float *v, int32_t n) {
|
||||||
|
std::vector<std::vector<float>> vec(n);
|
||||||
|
|
||||||
|
int32_t dim = p->impl->Dim();
|
||||||
|
|
||||||
|
for (int32_t i = 0; i != n; ++i, v += dim) {
|
||||||
|
vec[i] = std::vector<float>(v, v + dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
return p->impl->Add(name, vec);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerRemove(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name) {
|
||||||
|
return p->impl->Remove(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *SherpaOnnxSpeakerEmbeddingManagerSearch(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const float *v,
|
||||||
|
float threshold) {
|
||||||
|
auto r = p->impl->Search(v, threshold);
|
||||||
|
if (r.empty()) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *name = new char[r.size() + 1];
|
||||||
|
std::copy(r.begin(), r.end(), name);
|
||||||
|
name[r.size()] = '\0';
|
||||||
|
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free a name string that was allocated with `new[]`, e.g. the one returned
// by SherpaOnnxSpeakerEmbeddingManagerSearch().
//
// @param name String to free; passing NULL is a no-op.
void SherpaOnnxSpeakerEmbeddingManagerFreeSearch(const char *name) {
  delete[] name;
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerVerify(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float *v, float threshold) {
|
||||||
|
return p->impl->Verify(name, v, threshold);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerContains(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name) {
|
||||||
|
return p->impl->Contains(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p) {
|
||||||
|
return p->impl->NumSpeakers();
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *const *SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *manager) {
|
||||||
|
std::vector<std::string> all_speakers = manager->impl->GetAllSpeakers();
|
||||||
|
int32_t num_speakers = all_speakers.size();
|
||||||
|
char **p = new char *[num_speakers + 1];
|
||||||
|
p[num_speakers] = nullptr;
|
||||||
|
|
||||||
|
int32_t i = 0;
|
||||||
|
for (const auto &name : all_speakers) {
|
||||||
|
p[i] = new char[name.size() + 1];
|
||||||
|
std::copy(name.begin(), name.end(), p[i]);
|
||||||
|
p[i][name.size()] = '\0';
|
||||||
|
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free a NULL-terminated array of name strings in which the array and every
// string were allocated with `new[]`, e.g. the result of
// SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers().
//
// @param names Array to free; passing NULL is a no-op.
void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
    const char *const *names) {
  if (names != nullptr) {
    // Release each string up to, but not including, the NULL sentinel.
    for (const char *const *it = names; *it != nullptr; ++it) {
      delete[] *it;
    }
  }

  delete[] names;
}
|
||||||
|
|||||||
@@ -186,7 +186,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
|
|||||||
///
|
///
|
||||||
/// @param p A pointer returned by CreateOnlineRecognizer()
|
/// @param p A pointer returned by CreateOnlineRecognizer()
|
||||||
SHERPA_ONNX_API void DestroyOnlineRecognizer(
|
SHERPA_ONNX_API void DestroyOnlineRecognizer(
|
||||||
SherpaOnnxOnlineRecognizer *recognizer);
|
const SherpaOnnxOnlineRecognizer *recognizer);
|
||||||
|
|
||||||
/// Create an online stream for accepting wave samples.
|
/// Create an online stream for accepting wave samples.
|
||||||
///
|
///
|
||||||
@@ -208,7 +208,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords(
|
|||||||
/// Destroy an online stream.
|
/// Destroy an online stream.
|
||||||
///
|
///
|
||||||
/// @param stream A pointer returned by CreateOnlineStream()
|
/// @param stream A pointer returned by CreateOnlineStream()
|
||||||
SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream);
|
SHERPA_ONNX_API void DestroyOnlineStream(const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// Accept input audio samples and compute the features.
|
/// Accept input audio samples and compute the features.
|
||||||
/// The user has to invoke DecodeOnlineStream() to run the neural network and
|
/// The user has to invoke DecodeOnlineStream() to run the neural network and
|
||||||
@@ -221,7 +221,7 @@ SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream);
|
|||||||
/// @param samples A pointer to a 1-D array containing audio samples.
|
/// @param samples A pointer to a 1-D array containing audio samples.
|
||||||
/// The range of samples has to be normalized to [-1, 1].
|
/// The range of samples has to be normalized to [-1, 1].
|
||||||
/// @param n Number of elements in the samples array.
|
/// @param n Number of elements in the samples array.
|
||||||
SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream,
|
SHERPA_ONNX_API void AcceptWaveform(const SherpaOnnxOnlineStream *stream,
|
||||||
int32_t sample_rate, const float *samples,
|
int32_t sample_rate, const float *samples,
|
||||||
int32_t n);
|
int32_t n);
|
||||||
|
|
||||||
@@ -230,8 +230,9 @@ SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream,
|
|||||||
///
|
///
|
||||||
/// @param recognizer A pointer returned by CreateOnlineRecognizer
|
/// @param recognizer A pointer returned by CreateOnlineRecognizer
|
||||||
/// @param stream A pointer returned by CreateOnlineStream
|
/// @param stream A pointer returned by CreateOnlineStream
|
||||||
SHERPA_ONNX_API int32_t IsOnlineStreamReady(
|
SHERPA_ONNX_API int32_t
|
||||||
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
|
IsOnlineStreamReady(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
|
const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// Call this function to run the neural network model and decoding.
|
/// Call this function to run the neural network model and decoding.
|
||||||
//
|
//
|
||||||
@@ -243,8 +244,9 @@ SHERPA_ONNX_API int32_t IsOnlineStreamReady(
|
|||||||
/// DecodeOnlineStream(recognizer, stream);
|
/// DecodeOnlineStream(recognizer, stream);
|
||||||
/// }
|
/// }
|
||||||
///
|
///
|
||||||
SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer,
|
SHERPA_ONNX_API void DecodeOnlineStream(
|
||||||
SherpaOnnxOnlineStream *stream);
|
const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
|
const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// This function is similar to DecodeOnlineStream(). It decodes multiple
|
/// This function is similar to DecodeOnlineStream(). It decodes multiple
|
||||||
/// OnlineStream in parallel.
|
/// OnlineStream in parallel.
|
||||||
@@ -257,8 +259,8 @@ SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer,
|
|||||||
/// CreateOnlineRecognizer()
|
/// CreateOnlineRecognizer()
|
||||||
/// @param n Number of elements in the given streams array.
|
/// @param n Number of elements in the given streams array.
|
||||||
SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
|
SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
|
||||||
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream **streams,
|
const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
int32_t n);
|
const SherpaOnnxOnlineStream **streams, int32_t n);
|
||||||
|
|
||||||
/// Get the decoding results so far for an OnlineStream.
|
/// Get the decoding results so far for an OnlineStream.
|
||||||
///
|
///
|
||||||
@@ -268,7 +270,8 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
|
|||||||
/// DestroyOnlineRecognizerResult() to free the returned pointer to
|
/// DestroyOnlineRecognizerResult() to free the returned pointer to
|
||||||
/// avoid memory leak.
|
/// avoid memory leak.
|
||||||
SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
|
||||||
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
|
const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
|
const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// Destroy the pointer returned by GetOnlineStreamResult().
|
/// Destroy the pointer returned by GetOnlineStreamResult().
|
||||||
///
|
///
|
||||||
@@ -281,35 +284,36 @@ SHERPA_ONNX_API void DestroyOnlineRecognizerResult(
|
|||||||
///
|
///
|
||||||
/// @param recognizer A pointer returned by CreateOnlineRecognizer().
|
/// @param recognizer A pointer returned by CreateOnlineRecognizer().
|
||||||
/// @param stream A pointer returned by CreateOnlineStream
|
/// @param stream A pointer returned by CreateOnlineStream
|
||||||
SHERPA_ONNX_API void Reset(SherpaOnnxOnlineRecognizer *recognizer,
|
SHERPA_ONNX_API void Reset(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream *stream);
|
const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// Signal that no more audio samples would be available.
|
/// Signal that no more audio samples would be available.
|
||||||
/// After this call, you cannot call AcceptWaveform() any more.
|
/// After this call, you cannot call AcceptWaveform() any more.
|
||||||
///
|
///
|
||||||
/// @param stream A pointer returned by CreateOnlineStream()
|
/// @param stream A pointer returned by CreateOnlineStream()
|
||||||
SHERPA_ONNX_API void InputFinished(SherpaOnnxOnlineStream *stream);
|
SHERPA_ONNX_API void InputFinished(const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
/// Return 1 if an endpoint has been detected.
|
/// Return 1 if an endpoint has been detected.
|
||||||
///
|
///
|
||||||
/// @param recognizer A pointer returned by CreateOnlineRecognizer()
|
/// @param recognizer A pointer returned by CreateOnlineRecognizer()
|
||||||
/// @param stream A pointer returned by CreateOnlineStream()
|
/// @param stream A pointer returned by CreateOnlineStream()
|
||||||
/// @return Return 1 if an endpoint is detected. Return 0 otherwise.
|
/// @return Return 1 if an endpoint is detected. Return 0 otherwise.
|
||||||
SHERPA_ONNX_API int32_t IsEndpoint(SherpaOnnxOnlineRecognizer *recognizer,
|
SHERPA_ONNX_API int32_t IsEndpoint(const SherpaOnnxOnlineRecognizer *recognizer,
|
||||||
SherpaOnnxOnlineStream *stream);
|
const SherpaOnnxOnlineStream *stream);
|
||||||
|
|
||||||
// for displaying results on Linux/macOS.
|
// for displaying results on Linux/macOS.
|
||||||
SHERPA_ONNX_API typedef struct SherpaOnnxDisplay SherpaOnnxDisplay;
|
SHERPA_ONNX_API typedef struct SherpaOnnxDisplay SherpaOnnxDisplay;
|
||||||
|
|
||||||
/// Create a display object. Must be freed using DestroyDisplay to avoid
|
/// Create a display object. Must be freed using DestroyDisplay to avoid
|
||||||
/// memory leak.
|
/// memory leak.
|
||||||
SHERPA_ONNX_API SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line);
|
SHERPA_ONNX_API const SherpaOnnxDisplay *CreateDisplay(
|
||||||
|
int32_t max_word_per_line);
|
||||||
|
|
||||||
SHERPA_ONNX_API void DestroyDisplay(SherpaOnnxDisplay *display);
|
SHERPA_ONNX_API void DestroyDisplay(const SherpaOnnxDisplay *display);
|
||||||
|
|
||||||
/// Print the result.
|
/// Print the result.
|
||||||
SHERPA_ONNX_API void SherpaOnnxPrint(SherpaOnnxDisplay *display, int32_t idx,
|
SHERPA_ONNX_API void SherpaOnnxPrint(const SherpaOnnxDisplay *display,
|
||||||
const char *s);
|
int32_t idx, const char *s);
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// For offline ASR (i.e., non-streaming ASR)
|
// For offline ASR (i.e., non-streaming ASR)
|
||||||
// ============================================================
|
// ============================================================
|
||||||
@@ -769,7 +773,7 @@ typedef void (*SherpaOnnxGeneratedAudioCallbackWithArg)(const float *samples,
|
|||||||
int32_t n, void *arg);
|
int32_t n, void *arg);
|
||||||
|
|
||||||
typedef void (*SherpaOnnxGeneratedAudioProgressCallback)(const float *samples,
|
typedef void (*SherpaOnnxGeneratedAudioProgressCallback)(const float *samples,
|
||||||
int32_t n, float p);
|
int32_t n, float p);
|
||||||
|
|
||||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
|
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
|
||||||
|
|
||||||
@@ -839,7 +843,9 @@ SHERPA_ONNX_API const SherpaOnnxWave *SherpaOnnxReadWave(const char *filename);
|
|||||||
|
|
||||||
SHERPA_ONNX_API void SherpaOnnxFreeWave(const SherpaOnnxWave *wave);
|
SHERPA_ONNX_API void SherpaOnnxFreeWave(const SherpaOnnxWave *wave);
|
||||||
|
|
||||||
// Spoken language identification
|
// ============================================================
|
||||||
|
// For spoken language identification
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
SHERPA_ONNX_API typedef struct
|
SHERPA_ONNX_API typedef struct
|
||||||
SherpaOnnxSpokenLanguageIdentificationWhisperConfig {
|
SherpaOnnxSpokenLanguageIdentificationWhisperConfig {
|
||||||
@@ -893,6 +899,169 @@ SherpaOnnxSpokenLanguageIdentificationCompute(
|
|||||||
SHERPA_ONNX_API void SherpaOnnxDestroySpokenLanguageIdentificationResult(
|
SHERPA_ONNX_API void SherpaOnnxDestroySpokenLanguageIdentificationResult(
|
||||||
const SherpaOnnxSpokenLanguageIdentificationResult *r);
|
const SherpaOnnxSpokenLanguageIdentificationResult *r);
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// For speaker embedding extraction
|
||||||
|
// ============================================================
|
||||||
|
// Configuration passed to SherpaOnnxCreateSpeakerEmbeddingExtractor().
SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingExtractorConfig {
|
||||||
|
// NOTE(review): presumably the path to the embedding model file - confirm.
const char *model;
|
||||||
|
// NOTE(review): presumably the number of threads used for inference - confirm.
int32_t num_threads;
|
||||||
|
// When non-zero, the create function logs the resulting configuration.
// The create function reads it through SHERPA_ONNX_OR(..., 0).
int32_t debug;
|
||||||
|
// Execution provider name. The create function reads it through
// SHERPA_ONNX_OR(..., "cpu"), so "cpu" is presumably used when it is unset.
const char *provider;
|
||||||
|
} SherpaOnnxSpeakerEmbeddingExtractorConfig;
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingExtractor
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractor;
|
||||||
|
|
||||||
|
// The user has to invoke SherpaOnnxDestroySpeakerEmbeddingExtractor()
|
||||||
|
// to free the returned pointer to avoid memory leak
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor *
|
||||||
|
SherpaOnnxCreateSpeakerEmbeddingExtractor(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractorConfig *config);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroySpeakerEmbeddingExtractor(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingExtractorDim(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p);
|
||||||
|
|
||||||
|
// The user has to invoke DestroyOnlineStream() to free the returned pointer
|
||||||
|
// to avoid memory leak
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxOnlineStream *
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorCreateStream(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p);
|
||||||
|
|
||||||
|
// Return 1 if the stream has enough feature frames for computing embeddings.
|
||||||
|
// Return 0 otherwise.
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingExtractorIsReady(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p,
|
||||||
|
const SherpaOnnxOnlineStream *s);
|
||||||
|
|
||||||
|
// Compute the embedding of the stream.
|
||||||
|
//
|
||||||
|
// @return Return a pointer pointing to an array containing the embedding.
|
||||||
|
// The length of the array is `dim` as returned by
|
||||||
|
// SherpaOnnxSpeakerEmbeddingExtractorDim(p)
|
||||||
|
//
|
||||||
|
// The user has to invoke SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding()
|
||||||
|
// to free the returned pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const float *
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingExtractor *p,
|
||||||
|
const SherpaOnnxOnlineStream *s);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(
|
||||||
|
const float *v);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingManager
|
||||||
|
SherpaOnnxSpeakerEmbeddingManager;
|
||||||
|
|
||||||
|
// The user has to invoke SherpaOnnxDestroySpeakerEmbeddingManager()
|
||||||
|
// to free the returned pointer to avoid memory leak
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingManager *
|
||||||
|
SherpaOnnxCreateSpeakerEmbeddingManager(int32_t dim);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroySpeakerEmbeddingManager(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p);
|
||||||
|
|
||||||
|
// Register the embedding of a user
|
||||||
|
//
|
||||||
|
// @param name The name of the user
|
||||||
|
// @param v Pointer to an array containing the embedding. The length of the
|
||||||
|
// array must be equal to `dim` used to construct the manager `p`.
|
||||||
|
//
|
||||||
|
// @return Return 1 if added successfully. Return 0 on error
|
||||||
|
SHERPA_ONNX_API int32_t
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerAdd(const SherpaOnnxSpeakerEmbeddingManager *p,
|
||||||
|
const char *name, const float *v);
|
||||||
|
|
||||||
|
// @param v Pointer to an array of embeddings. If there are n embeddings, then
|
||||||
|
// v[0] is the pointer to the 0-th array containing the embeddings
|
||||||
|
// v[1] is the pointer to the 1-st array containing the embeddings
|
||||||
|
// v[n-1] is the pointer to the last array containing the embeddings
|
||||||
|
// v[n] is a NULL pointer
|
||||||
|
// @return Return 1 if added successfully. Return 0 on error
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerAddList(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float **v);
|
||||||
|
|
||||||
|
// Similar to SherpaOnnxSpeakerEmbeddingManagerAddList() but the memory
|
||||||
|
// is flattened.
|
||||||
|
//
|
||||||
|
// The length of the input array should be `n * dim`.
|
||||||
|
//
|
||||||
|
// @return Return 1 if added successfully. Return 0 on error
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float *v, int32_t n);
|
||||||
|
|
||||||
|
// Remove a user.
|
||||||
|
// @param name The name of the user to remove.
|
||||||
|
// @return Return 1 if removed successfully; return 0 on error.
|
||||||
|
//
|
||||||
|
// Note if the user does not exist, it also returns 0.
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerRemove(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name);
|
||||||
|
|
||||||
|
// Search whether an existing user's embedding matches the given one.
|
||||||
|
//
|
||||||
|
// @param v Pointer to an array containing the embedding. The dim
|
||||||
|
// of the array must be equal to `dim` used to construct the manager `p`.
|
||||||
|
// @param threshold A value between 0 and 1. If the similarity score exceeds
|
||||||
|
// this threshold, we say a match is found.
|
||||||
|
// @return Returns the name of the user if found. Return NULL if not found.
|
||||||
|
// If not NULL, the caller has to invoke
|
||||||
|
// SherpaOnnxSpeakerEmbeddingManagerFreeSearch() to free the returned
|
||||||
|
// pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const char *SherpaOnnxSpeakerEmbeddingManagerSearch(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const float *v,
|
||||||
|
float threshold);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeSearch(
|
||||||
|
const char *name);
|
||||||
|
|
||||||
|
// Check whether the input embedding matches the embedding of the input
|
||||||
|
// speaker.
|
||||||
|
//
|
||||||
|
// It is for speaker verification.
|
||||||
|
//
|
||||||
|
// @param name The target speaker name.
|
||||||
|
// @param v The input embedding to check.
|
||||||
|
// @param threshold A value between 0 and 1.
|
||||||
|
// @return Return 1 if it matches. Otherwise, it returns 0.
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerVerify(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name,
|
||||||
|
const float *v, float threshold);
|
||||||
|
|
||||||
|
// Return 1 if the user with the name is in the manager.
|
||||||
|
// Return 0 if the user does not exist.
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerContains(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p, const char *name);
|
||||||
|
|
||||||
|
// Return number of speakers in the manager.
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p);
|
||||||
|
|
||||||
|
// Return the name of all speakers in the manager.
|
||||||
|
//
|
||||||
|
// @return Return an array of pointers `ans`. If there are n speakers, then
|
||||||
|
// - ans[0] contains the name of the 0-th speaker
|
||||||
|
// - ans[1] contains the name of the 1-st speaker
|
||||||
|
// - ans[n-1] contains the name of the last speaker
|
||||||
|
// - ans[n] is NULL
|
||||||
|
// If there are no users at all, then ans[0] is NULL. In any case,
|
||||||
|
// `ans` is not NULL.
|
||||||
|
//
|
||||||
|
// Each name is NULL-terminated
|
||||||
|
//
|
||||||
|
// The caller has to invoke SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers()
|
||||||
|
// to free the returned pointer to avoid memory leak.
|
||||||
|
SHERPA_ONNX_API const char *const *
|
||||||
|
SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(
|
||||||
|
const SherpaOnnxSpeakerEmbeddingManager *p);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
|
||||||
|
const char *const *names);
|
||||||
|
|
||||||
#if defined(__GNUC__)
|
#if defined(__GNUC__)
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -168,7 +168,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
|||||||
ans.samples.insert(ans.samples.end(), audio.samples.begin(),
|
ans.samples.insert(ans.samples.end(), audio.samples.begin(),
|
||||||
audio.samples.end());
|
audio.samples.end());
|
||||||
if (callback) {
|
if (callback) {
|
||||||
callback(audio.samples.data(), audio.samples.size(), b * 1.0 / num_batches);
|
callback(audio.samples.data(), audio.samples.size(),
|
||||||
|
b * 1.0 / num_batches);
|
||||||
// Caution(fangjun): audio is freed when the callback returns, so users
|
// Caution(fangjun): audio is freed when the callback returns, so users
|
||||||
// should copy the data if they want to access the data after
|
// should copy the data if they want to access the data after
|
||||||
// the callback returns to avoid segmentation fault.
|
// the callback returns to avoid segmentation fault.
|
||||||
|
|||||||
@@ -54,8 +54,8 @@ struct GeneratedAudio {
|
|||||||
|
|
||||||
class OfflineTtsImpl;
|
class OfflineTtsImpl;
|
||||||
|
|
||||||
using GeneratedAudioCallback =
|
using GeneratedAudioCallback = std::function<void(
|
||||||
std::function<void(const float * /*samples*/, int32_t /*n*/, float /*progress*/)>;
|
const float * /*samples*/, int32_t /*n*/, float /*progress*/)>;
|
||||||
|
|
||||||
class OfflineTts {
|
class OfflineTts {
|
||||||
public:
|
public:
|
||||||
|
|||||||
@@ -44,7 +44,8 @@ static void Handler(int32_t /*sig*/) {
|
|||||||
fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
|
fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void AudioGeneratedCallback(const float *s, int32_t n) {
|
static void AudioGeneratedCallback(const float *s, int32_t n,
|
||||||
|
float /*progress*/) {
|
||||||
if (n > 0) {
|
if (n > 0) {
|
||||||
std::lock_guard<std::mutex> lock(g_buffer.mutex);
|
std::lock_guard<std::mutex> lock(g_buffer.mutex);
|
||||||
g_buffer.samples.push({s, s + n});
|
g_buffer.samples.push({s, s + n});
|
||||||
|
|||||||
@@ -47,7 +47,8 @@ static void Handler(int32_t /*sig*/) {
|
|||||||
fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
|
fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void AudioGeneratedCallback(const float *s, int32_t n, float /*progress*/) {
|
static void AudioGeneratedCallback(const float *s, int32_t n,
|
||||||
|
float /*progress*/) {
|
||||||
if (n > 0) {
|
if (n > 0) {
|
||||||
Samples samples;
|
Samples samples;
|
||||||
samples.data = std::vector<float>{s, s + n};
|
samples.data = std::vector<float>{s, s + n};
|
||||||
|
|||||||
@@ -9,9 +9,8 @@
|
|||||||
#include "sherpa-onnx/csrc/parse-options.h"
|
#include "sherpa-onnx/csrc/parse-options.h"
|
||||||
#include "sherpa-onnx/csrc/wave-writer.h"
|
#include "sherpa-onnx/csrc/wave-writer.h"
|
||||||
|
|
||||||
void audioCallback(const float *samples, int32_t n, float progress)
|
void audioCallback(const float *samples, int32_t n, float progress) {
|
||||||
{
|
printf("sample=%d, progress=%f\n", n, progress);
|
||||||
printf( "sample=%d, progress=%f\n", n, progress );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int32_t argc, char *argv[]) {
|
int main(int32_t argc, char *argv[]) {
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ class SpeakerEmbeddingManager::Impl {
|
|||||||
int32_t num_rows = embedding_matrix_.rows();
|
int32_t num_rows = embedding_matrix_.rows();
|
||||||
|
|
||||||
if (row_idx < num_rows - 1) {
|
if (row_idx < num_rows - 1) {
|
||||||
embedding_matrix_.block(row_idx, 0, num_rows - -1 - row_idx, dim_) =
|
embedding_matrix_.block(row_idx, 0, num_rows - 1 - row_idx, dim_) =
|
||||||
embedding_matrix_.bottomRows(num_rows - 1 - row_idx);
|
embedding_matrix_.bottomRows(num_rows - 1 - row_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -795,9 +795,10 @@ class SherpaOnnxOfflineTts {
|
|||||||
explicit SherpaOnnxOfflineTts(const OfflineTtsConfig &config)
|
explicit SherpaOnnxOfflineTts(const OfflineTtsConfig &config)
|
||||||
: tts_(config) {}
|
: tts_(config) {}
|
||||||
|
|
||||||
GeneratedAudio Generate(
|
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
|
||||||
const std::string &text, int64_t sid = 0, float speed = 1.0,
|
float speed = 1.0,
|
||||||
std::function<void(const float *, int32_t, float)> callback = nullptr) const {
|
std::function<void(const float *, int32_t, float)>
|
||||||
|
callback = nullptr) const {
|
||||||
return tts_.Generate(text, sid, speed, callback);
|
return tts_.Generate(text, sid, speed, callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -55,14 +55,16 @@ void PybindOfflineTts(py::module *m) {
|
|||||||
.def(
|
.def(
|
||||||
"generate",
|
"generate",
|
||||||
[](const PyClass &self, const std::string &text, int64_t sid,
|
[](const PyClass &self, const std::string &text, int64_t sid,
|
||||||
float speed, std::function<void(py::array_t<float>, float)> callback)
|
float speed,
|
||||||
|
std::function<void(py::array_t<float>, float)> callback)
|
||||||
-> GeneratedAudio {
|
-> GeneratedAudio {
|
||||||
if (!callback) {
|
if (!callback) {
|
||||||
return self.Generate(text, sid, speed);
|
return self.Generate(text, sid, speed);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::function<void(const float *, int32_t, float)> callback_wrapper =
|
std::function<void(const float *, int32_t, float)>
|
||||||
[callback](const float *samples, int32_t n, float progress) {
|
callback_wrapper = [callback](const float *samples, int32_t n,
|
||||||
|
float progress) {
|
||||||
// CAUTION(fangjun): we have to copy samples since it is
|
// CAUTION(fangjun): we have to copy samples since it is
|
||||||
// freed once the call back returns.
|
// freed once the call back returns.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user