Add speaker diarization API for HarmonyOS. (#1609)
This commit is contained in:
@@ -1,11 +1,6 @@
|
||||
export {
|
||||
listRawfileDir,
|
||||
readWave,
|
||||
readWaveFromBinary,
|
||||
} from "libsherpa_onnx.so";
|
||||
export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so";
|
||||
|
||||
export {
|
||||
CircularBuffer,
|
||||
export { CircularBuffer,
|
||||
SileroVadConfig,
|
||||
SpeechSegment,
|
||||
Vad,
|
||||
@@ -13,8 +8,7 @@ export {
|
||||
} from './src/main/ets/components/Vad';
|
||||
|
||||
|
||||
export {
|
||||
Samples,
|
||||
export { Samples,
|
||||
OfflineStream,
|
||||
FeatureConfig,
|
||||
OfflineTransducerModelConfig,
|
||||
@@ -31,8 +25,7 @@ export {
|
||||
OfflineRecognizer,
|
||||
} from './src/main/ets/components/NonStreamingAsr';
|
||||
|
||||
export {
|
||||
OnlineStream,
|
||||
export { OnlineStream,
|
||||
OnlineTransducerModelConfig,
|
||||
OnlineParaformerModelConfig,
|
||||
OnlineZipformer2CtcModelConfig,
|
||||
@@ -43,8 +36,7 @@ export {
|
||||
OnlineRecognizer,
|
||||
} from './src/main/ets/components/StreamingAsr';
|
||||
|
||||
export {
|
||||
OfflineTtsVitsModelConfig,
|
||||
export { OfflineTtsVitsModelConfig,
|
||||
OfflineTtsModelConfig,
|
||||
OfflineTtsConfig,
|
||||
OfflineTts,
|
||||
@@ -52,8 +44,15 @@ export {
|
||||
TtsInput,
|
||||
} from './src/main/ets/components/NonStreamingTts';
|
||||
|
||||
export {
|
||||
SpeakerEmbeddingExtractorConfig,
|
||||
export { SpeakerEmbeddingExtractorConfig,
|
||||
SpeakerEmbeddingExtractor,
|
||||
SpeakerEmbeddingManager,
|
||||
} from './src/main/ets/components/SpeakerIdentification';
|
||||
|
||||
export { OfflineSpeakerSegmentationPyannoteModelConfig,
|
||||
OfflineSpeakerSegmentationModelConfig,
|
||||
OfflineSpeakerDiarizationConfig,
|
||||
OfflineSpeakerDiarizationSegment,
|
||||
OfflineSpeakerDiarization,
|
||||
FastClusteringConfig,
|
||||
} from './src/main/ets/components/NonStreamingSpeakerDiarization';
|
||||
|
||||
@@ -101,6 +101,17 @@ static SherpaOnnxFastClusteringConfig GetFastClusteringConfig(
|
||||
static Napi::External<SherpaOnnxOfflineSpeakerDiarization>
|
||||
CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
|
||||
Napi::Env env = info.Env();
|
||||
|
||||
#if __OHOS__
|
||||
if (info.Length() != 2) {
|
||||
std::ostringstream os;
|
||||
os << "Expect only 2 arguments. Given: " << info.Length();
|
||||
|
||||
Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
|
||||
|
||||
return {};
|
||||
}
|
||||
#else
|
||||
if (info.Length() != 1) {
|
||||
std::ostringstream os;
|
||||
os << "Expect only 1 argument. Given: " << info.Length();
|
||||
@@ -109,6 +120,7 @@ CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
|
||||
|
||||
return {};
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!info[0].IsObject()) {
|
||||
Napi::TypeError::New(env, "Expect an object as the argument")
|
||||
@@ -129,8 +141,18 @@ CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
|
||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_on, minDurationOn);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_off, minDurationOff);
|
||||
|
||||
#if __OHOS__
|
||||
std::unique_ptr<NativeResourceManager,
|
||||
decltype(&OH_ResourceManager_ReleaseNativeResourceManager)>
|
||||
mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]),
|
||||
&OH_ResourceManager_ReleaseNativeResourceManager);
|
||||
|
||||
const SherpaOnnxOfflineSpeakerDiarization *sd =
|
||||
SherpaOnnxCreateOfflineSpeakerDiarizationOHOS(&c, mgr.get());
|
||||
#else
|
||||
const SherpaOnnxOfflineSpeakerDiarization *sd =
|
||||
SherpaOnnxCreateOfflineSpeakerDiarization(&c);
|
||||
#endif
|
||||
|
||||
if (c.segmentation.pyannote.model) {
|
||||
delete[] c.segmentation.pyannote.model;
|
||||
@@ -224,9 +246,17 @@ static Napi::Array OfflineSpeakerDiarizationProcessWrapper(
|
||||
|
||||
Napi::Float32Array samples = info[1].As<Napi::Float32Array>();
|
||||
|
||||
#if __OHOS__
|
||||
// Note(fangjun): For unknown reasons on HarmonyOS, we need to divide it by
|
||||
// sizeof(float) here
|
||||
const SherpaOnnxOfflineSpeakerDiarizationResult *r =
|
||||
SherpaOnnxOfflineSpeakerDiarizationProcess(
|
||||
sd, samples.Data(), samples.ElementLength() / sizeof(float));
|
||||
#else
|
||||
const SherpaOnnxOfflineSpeakerDiarizationResult *r =
|
||||
SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.Data(),
|
||||
samples.ElementLength());
|
||||
#endif
|
||||
|
||||
int32_t num_segments =
|
||||
SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r);
|
||||
|
||||
@@ -62,3 +62,8 @@ export const speakerEmbeddingManagerVerify: (handle: object, obj: {name: string,
|
||||
export const speakerEmbeddingManagerContains: (handle: object, name: string) => boolean;
|
||||
export const speakerEmbeddingManagerNumSpeakers: (handle: object) => number;
|
||||
export const speakerEmbeddingManagerGetAllSpeakers: (handle: object) => Array<string>;
|
||||
|
||||
/**
 * Creates a native offline speaker diarization object from `config` and
 * returns an opaque handle to it.
 *
 * `mgr` is optional in this declaration; the native wrapper shown in this
 * change requires it as the second argument when built with `__OHOS__`,
 * where it is used to initialise a native resource manager.
 */
export const createOfflineSpeakerDiarization: (config: object, mgr?: object) => object;
|
||||
/**
 * Returns the sample rate associated with the given diarization handle.
 * NOTE(review): presumably the rate the models expect for input audio — confirm.
 */
export const getOfflineSpeakerDiarizationSampleRate: (handle: object) => number;
|
||||
/**
 * Runs diarization over `samples` using the given handle.
 * NOTE(review): the native wrapper is declared to return a `Napi::Array`, so
 * the `object` returned here is presumably an array of segments — confirm.
 */
export const offlineSpeakerDiarizationProcess: (handle: object, samples: Float32Array) => object;
|
||||
/** Passes a new `config` to the native diarization object behind `handle`. */
export const offlineSpeakerDiarizationSetConfig: (handle: object, config: object) => void;
|
||||
|
||||
@@ -67,10 +67,15 @@ static Napi::Boolean WriteWaveWrapper(const Napi::CallbackInfo &info) {
|
||||
|
||||
Napi::Float32Array samples = obj.Get("samples").As<Napi::Float32Array>();
|
||||
int32_t sample_rate = obj.Get("sampleRate").As<Napi::Number>().Int32Value();
|
||||
|
||||
#if __OHOS__
|
||||
int32_t ok = SherpaOnnxWriteWave(
|
||||
samples.Data(), samples.ElementLength() / sizeof(float), sample_rate,
|
||||
info[0].As<Napi::String>().Utf8Value().c_str());
|
||||
#else
|
||||
int32_t ok =
|
||||
SherpaOnnxWriteWave(samples.Data(), samples.ElementLength(), sample_rate,
|
||||
info[0].As<Napi::String>().Utf8Value().c_str());
|
||||
#endif
|
||||
|
||||
return Napi::Boolean::New(env, ok);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
import {
|
||||
createOfflineSpeakerDiarization,
|
||||
getOfflineSpeakerDiarizationSampleRate,
|
||||
offlineSpeakerDiarizationProcess,
|
||||
offlineSpeakerDiarizationSetConfig,
|
||||
} from 'libsherpa_onnx.so';
|
||||
|
||||
import { SpeakerEmbeddingExtractorConfig } from './SpeakerIdentification';
|
||||
|
||||
/**
 * Settings for a pyannote speaker segmentation model.
 */
export class OfflineSpeakerSegmentationPyannoteModelConfig {
  /**
   * Model string handed to the native library; empty by default.
   * NOTE(review): presumably a path to the model file — confirm.
   */
  public model: string = '';
}
|
||||
|
||||
/**
 * Settings for the speaker segmentation stage.
 */
export class OfflineSpeakerSegmentationModelConfig {
  /** Pyannote model settings; a fresh default instance is created per config. */
  public pyannote: OfflineSpeakerSegmentationPyannoteModelConfig = new OfflineSpeakerSegmentationPyannoteModelConfig();

  /** Thread count setting; defaults to 1. */
  public numThreads: number = 1;

  /** Debug flag; off by default. */
  public debug: boolean = false;

  /** Execution provider name; defaults to 'cpu'. */
  public provider: string = 'cpu';
}
|
||||
|
||||
/**
 * Settings for the clustering stage.
 */
export class FastClusteringConfig {
  /**
   * Number of clusters; defaults to -1.
   * NOTE(review): -1 presumably means "unknown, fall back to threshold" — confirm.
   */
  public numClusters: number = -1;

  /** Clustering threshold; defaults to 0.5. */
  public threshold: number = 0.5;
}
|
||||
|
||||
/**
 * Top-level configuration passed to OfflineSpeakerDiarization.
 *
 * Each nested config is initialised with its own default instance, so a
 * freshly constructed object is fully populated.
 */
export class OfflineSpeakerDiarizationConfig {
  /** Speaker segmentation model settings. */
  public segmentation: OfflineSpeakerSegmentationModelConfig = new OfflineSpeakerSegmentationModelConfig();

  /** Speaker embedding extractor settings. */
  public embedding: SpeakerEmbeddingExtractorConfig = new SpeakerEmbeddingExtractorConfig();

  /** Clustering settings. */
  public clustering: FastClusteringConfig = new FastClusteringConfig();

  /**
   * Defaults to 0.2.
   * NOTE(review): presumably a minimum speech-segment duration in seconds — confirm.
   */
  public minDurationOn: number = 0.2;

  /**
   * Defaults to 0.5.
   * NOTE(review): presumably a minimum gap duration in seconds — confirm.
   */
  public minDurationOff: number = 0.5;
}
|
||||
|
||||
/**
 * One diarization result: a time span attributed to a single speaker.
 *
 * Each field is declared on its own line. Previously the three declarations
 * were fused into one line, so `end` and `speaker` ended up inside a `//`
 * comment and never existed on the class.
 */
export class OfflineSpeakerDiarizationSegment {
  /** Start time of the segment, in seconds. */
  public start: number = 0;

  /** End time of the segment, in seconds. */
  public end: number = 0;

  /** ID of the speaker; counted from 0. */
  public speaker: number = 0;
}
|
||||
|
||||
/**
 * Wrapper around the native offline speaker diarization object.
 */
export class OfflineSpeakerDiarization {
  /** The configuration this instance was created with (see setConfig). */
  public config: OfflineSpeakerDiarizationConfig;

  /** Sample rate reported by the native object for this instance. */
  public sampleRate: number;

  /** Opaque handle to the native diarization object. */
  private handle: object;

  /**
   * @param config Diarization configuration forwarded to the native layer.
   * @param mgr Optional object forwarded as the second argument of
   *            createOfflineSpeakerDiarization.
   */
  constructor(config: OfflineSpeakerDiarizationConfig, mgr?: object) {
    this.handle = createOfflineSpeakerDiarization(config, mgr);
    this.config = config;

    this.sampleRate = getOfflineSpeakerDiarizationSampleRate(this.handle);
  }

  /**
   * samples is a 1-d float32 array. Each element of the array should be
   * in the range [-1, 1].
   *
   * We assume its sample rate equals to this.sampleRate.
   *
   * Returns an array of objects, where each object is
   *
   *  {
   *    "start": start_time_in_seconds,
   *    "end": end_time_in_seconds,
   *    "speaker": an_integer,
   *  }
   *
   * The return type is an array to match this contract; it was previously
   * declared as a single segment.
   */
  process(samples: Float32Array): OfflineSpeakerDiarizationSegment[] {
    return offlineSpeakerDiarizationProcess(this.handle, samples) as OfflineSpeakerDiarizationSegment[];
  }

  /**
   * Forwards `config` to the native object. Only the clustering part is
   * mirrored into this.config; the other fields of this.config are left
   * as they were.
   */
  setConfig(config: OfflineSpeakerDiarizationConfig): void {
    offlineSpeakerDiarizationSetConfig(this.handle, config);
    this.config.clustering = config.clustering;
  }
}
|
||||
@@ -35,8 +35,7 @@ export class SpeakerEmbeddingExtractor {
|
||||
}
|
||||
|
||||
/**
 * Creates a new stream for this extractor by wrapping the native stream
 * handle in an OnlineStream.
 */
createStream(): OnlineStream {
  // A single return: the duplicated second return statement was unreachable.
  return new OnlineStream(speakerEmbeddingExtractorCreateStream(this.handle));
}
|
||||
|
||||
isReady(stream: OnlineStream): boolean {
|
||||
@@ -44,8 +43,7 @@ export class SpeakerEmbeddingExtractor {
|
||||
}
|
||||
|
||||
/**
 * Computes the embedding for `stream` and returns it as a Float32Array.
 *
 * @param stream Stream whose native handle is passed to the extractor.
 * @param enableExternalBuffer Forwarded unchanged to the native call;
 *                             defaults to true.
 */
compute(stream: OnlineStream, enableExternalBuffer: boolean = true): Float32Array {
  // A single return: the duplicated second return statement was unreachable.
  return speakerEmbeddingExtractorComputeEmbedding(this.handle, stream.handle, enableExternalBuffer);
}
|
||||
}
|
||||
|
||||
@@ -106,9 +104,7 @@ export class SpeakerEmbeddingManager {
|
||||
|
||||
/**
 * Registers several embeddings under one speaker name.
 *
 * The embeddings in `speaker.v` are flattened into a single array, and their
 * count is passed alongside it as `n`.
 *
 * @returns The boolean result of the native call.
 */
addMulti(speaker: SpeakerNameWithEmbeddingList): boolean {
  // Each key appears exactly once; the literal previously listed name/vv/n
  // twice, which evaluated flatten() twice and is rejected by the compiler.
  const c: SpeakerNameWithEmbeddingN = {
    name: speaker.name, vv: flatten(speaker.v), n: speaker.v.length,
  };
  return speakerEmbeddingManagerAddListFlattened(this.handle, c);
}
|
||||
|
||||
@@ -125,8 +125,7 @@ export class OnlineRecognizer {
|
||||
}
|
||||
|
||||
getResult(stream: OnlineStream): OnlineRecognizerResult {
|
||||
const jsonStr: string =
|
||||
getOnlineStreamResultAsJson(this.handle, stream.handle);
|
||||
const jsonStr: string = getOnlineStreamResultAsJson(this.handle, stream.handle);
|
||||
|
||||
let o = JSON.parse(jsonStr) as OnlineRecognizerResultJson;
|
||||
|
||||
|
||||
@@ -62,8 +62,7 @@ export class CircularBuffer {
|
||||
|
||||
/**
 * Returns a float32 array obtained from the native circular buffer.
 *
 * @param startIndex Forwarded to the native call as the start position.
 * @param n Forwarded to the native call as the element count.
 * @param enableExternalBuffer Forwarded unchanged to the native call;
 *                             defaults to true.
 */
get(startIndex: number, n: number, enableExternalBuffer: boolean = true): Float32Array {
  // A single return: the duplicated second return statement was unreachable.
  return circularBufferGet(this.handle, startIndex, n, enableExternalBuffer);
}
|
||||
|
||||
pop(n: number) {
|
||||
@@ -93,8 +92,7 @@ export class Vad {
|
||||
private handle: object;
|
||||
|
||||
/**
 * Creates the native voice activity detector and remembers its config.
 *
 * @param config VAD configuration forwarded to the native layer.
 * @param bufferSizeInSeconds Optional; forwarded unchanged to the native call.
 * @param mgr Optional; forwarded unchanged to the native call.
 */
constructor(config: VadConfig, bufferSizeInSeconds?: number, mgr?: object) {
  // Create the detector exactly once; creating it twice discarded the first
  // native handle right after it was allocated.
  this.handle = createVoiceActivityDetector(config, bufferSizeInSeconds, mgr);
  this.config = config;
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user