Configurable low_freq high_freq, dithering (#664)

This commit is contained in:
Karel Vesely
2024-03-22 14:41:44 +01:00
committed by GitHub
parent 2fc1201924
commit eaec4c83c2
10 changed files with 96 additions and 15 deletions

View File

@@ -21,6 +21,27 @@ struct FeatureExtractorConfig {
// Feature dimension
int32_t feature_dim = 80;
// minimal frequency for Mel-filterbank, in Hz
float low_freq = 20.0f;
// maximal frequency of Mel-filterbank, in Hz.
// A negative value is interpreted as an offset from the Nyquist frequency,
// e.g. for sampling_rate 16000 and high_freq -400: 16000 / 2 - 400 = 7600 Hz
//
// Please see
// https://github.com/lhotse-speech/lhotse/blob/master/lhotse/features/fbank.py#L27
// and
// https://github.com/k2-fsa/sherpa-onnx/issues/514
float high_freq = -400.0f;
// dithering constant, useful for signals with hard zeros in non-speech parts;
// it prevents large negative values in log-mel filterbanks
//
// In k2, audio samples are in the range [-1..+1], whereas in kaldi the range
// was [-32k..+32k], so the value 0.00003 is equivalent to the kaldi default 1.0
//
float dither = 0.0f; // dithering disabled by default
// Set internally by some models, e.g., paraformer sets it to false.
// This parameter is not exposed to users from the commandline
// If true, the feature extractor expects inputs to be normalized to
@@ -31,7 +52,6 @@ struct FeatureExtractorConfig {
bool snip_edges = false;
float frame_shift_ms = 10.0f; // in milliseconds.
float frame_length_ms = 25.0f; // in milliseconds.
int32_t low_freq = 20;
bool is_librosa = false;
bool remove_dc_offset = true; // Subtract mean of wave before FFT.
std::string window_type = "povey"; // e.g. Hamming window