// sherpa-onnx/csrc/offline-sense-voice-model-meta-data.h
//
// Copyright (c) 2024 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SENSE_VOICE_MODEL_META_DATA_H_
#define SHERPA_ONNX_CSRC_OFFLINE_SENSE_VOICE_MODEL_META_DATA_H_

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

namespace sherpa_onnx {

// Plain aggregate holding the meta data of an offline SenseVoice model.
//
// Every scalar member has a default member initializer so that a
// default-initialized instance never contains indeterminate values.
// NOTE(review): presumably the model loader overwrites these defaults with
// the values stored in the model file -- confirm against the loader.
struct OfflineSenseVoiceModelMetaData {
  // ID for using inverse text normalization
  int32_t with_itn_id = 0;

  // ID for not using inverse text normalization
  int32_t without_itn_id = 0;

  int32_t window_size = 0;   // lfr_m
  int32_t window_shift = 0;  // lfr_n
  int32_t vocab_size = 0;

  int32_t subsampling_factor = 1;

  // Usually 0 for SenseVoice models.
  // 0 means samples are scaled to [-32768, 32767] before they are sent to
  // the feature extractor
  int32_t normalize_samples = 0;

  int32_t blank_id = 0;

  // Maps a language name to its ID.
  //
  // possible values of the key:
  //  zh, en, ja, ko, yue, auto
  // where
  //  zh is Chinese (Mandarin)
  //  en is English
  //  ja is Japanese
  //  ko is Korean
  //  yue is Cantonese
  //  auto is to let the model recognize the language
  std::unordered_map<std::string, int32_t> lang2id;

  // NOTE(review): judging by the names, these are the negated mean and the
  // inverse standard deviation applied to the input features -- confirm
  // against the code that consumes them.
  std::vector<float> neg_mean;
  std::vector<float> inv_stddev;
};

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_CSRC_OFFLINE_SENSE_VOICE_MODEL_META_DATA_H_