Remove the 30-second constraint from whisper. (#471)
This commit is contained in:
@@ -115,7 +115,27 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
|
||||
|
||||
NormalizeFeatures(f.data(), num_frames, feat_dim);
|
||||
|
||||
std::array<int64_t, 3> shape{1, max_num_frames, feat_dim};
|
||||
// Note that 50 is an empirically chosen value.
|
||||
// see also ../../scripts/whisper/test.py
|
||||
//
|
||||
// You can replace 50 by other values, say, 100.
|
||||
//
|
||||
// Since we have removed the 30-second constraint, we need
|
||||
// tail_padding_frames so that whisper is able to detect the eot token.
|
||||
int32_t tail_padding_frames = 50;
|
||||
if (model_->IsMultiLingual()) {
|
||||
// 300 is an empirically chosen value. If it throws, please use a larger value.
|
||||
tail_padding_frames = 300;
|
||||
}
|
||||
|
||||
if (config_.model_config.whisper.tail_paddings > 0) {
|
||||
tail_padding_frames = config_.model_config.whisper.tail_paddings;
|
||||
}
|
||||
|
||||
int32_t actual_frames =
|
||||
std::min(num_frames + tail_padding_frames, max_num_frames);
|
||||
|
||||
std::array<int64_t, 3> shape{1, actual_frames, feat_dim};
|
||||
|
||||
Ort::Value mel = Ort::Value::CreateTensor<float>(
|
||||
model_->Allocator(), shape.data(), shape.size());
|
||||
@@ -123,7 +143,7 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
|
||||
std::copy(f.begin(), f.end(), p_mel);
|
||||
|
||||
memset(p_mel + f.size(), 0,
|
||||
(max_num_frames - num_frames) * feat_dim * sizeof(float));
|
||||
(actual_frames - num_frames) * feat_dim * sizeof(float));
|
||||
mel = Transpose12(model_->Allocator(), &mel);
|
||||
|
||||
try {
|
||||
|
||||
@@ -32,6 +32,14 @@ void OfflineWhisperModelConfig::Register(ParseOptions *po) {
|
||||
"Valid values: transcribe, translate. "
|
||||
"Note that for non-multilingual models, it supports "
|
||||
"only 'transcribe'");
|
||||
|
||||
po->Register(
|
||||
"whisper-tail-paddings", &tail_paddings,
|
||||
"Suggest value: 50 for English models. 300 for multilingual models. "
|
||||
"Since we have removed the 30-second constraint, we need to add some "
|
||||
"tail padding frames "
|
||||
"so that whisper can detect the eot token. Leave it to -1 to use 50 for "
|
||||
"English models and 300 for multilingual models.");
|
||||
}
|
||||
|
||||
bool OfflineWhisperModelConfig::Validate() const {
|
||||
@@ -63,7 +71,8 @@ std::string OfflineWhisperModelConfig::ToString() const {
|
||||
os << "encoder=\"" << encoder << "\", ";
|
||||
os << "decoder=\"" << decoder << "\", ";
|
||||
os << "language=\"" << language << "\", ";
|
||||
os << "task=\"" << task << "\")";
|
||||
os << "task=\"" << task << "\", ";
|
||||
os << "tail_paddings=" << tail_paddings << ")";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
@@ -28,12 +28,26 @@ struct OfflineWhisperModelConfig {
|
||||
// Note: For non-multilingual models, it supports only "transcribe"
|
||||
std::string task = "transcribe";
|
||||
|
||||
// Number of tail padding frames.
|
||||
//
|
||||
// Since we removed the 30-second constraint, we need to add some padding
|
||||
// at the end.
|
||||
//
|
||||
// Recommended values:
|
||||
// - 50 for English models
|
||||
// - 300 for multilingual models
|
||||
int32_t tail_paddings = -1;
|
||||
|
||||
OfflineWhisperModelConfig() = default;
|
||||
OfflineWhisperModelConfig(const std::string &encoder,
|
||||
const std::string &decoder,
|
||||
const std::string &language,
|
||||
const std::string &task)
|
||||
: encoder(encoder), decoder(decoder), language(language), task(task) {}
|
||||
const std::string &task, int32_t tail_paddings)
|
||||
: encoder(encoder),
|
||||
decoder(decoder),
|
||||
language(language),
|
||||
task(task),
|
||||
tail_paddings(tail_paddings) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
Reference in New Issue
Block a user