Remove the 30-second constraint from whisper. (#471)

This commit is contained in:
Fangjun Kuang
2023-12-07 17:47:08 +08:00
committed by GitHub
parent a7d69359c9
commit 3ae984f148
10 changed files with 178 additions and 78 deletions

View File

@@ -115,7 +115,27 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
NormalizeFeatures(f.data(), num_frames, feat_dim);
std::array<int64_t, 3> shape{1, max_num_frames, feat_dim};
// note that 50 is an experience value.
// see also ../../scripts/whisper/test.py
//
// You can replace 50 by other values, say, 100.
//
// Since we have removed the 30 seconds constraint, we need
// tail_padding_frames so that whisper is able to detect the eot token.
int32_t tail_padding_frames = 50;
if (model_->IsMultiLingual()) {
// 300 is an experience value. If it throws, please use a larger value.
tail_padding_frames = 300;
}
if (config_.model_config.whisper.tail_paddings > 0) {
tail_padding_frames = config_.model_config.whisper.tail_paddings;
}
int32_t actual_frames =
std::min(num_frames + tail_padding_frames, max_num_frames);
std::array<int64_t, 3> shape{1, actual_frames, feat_dim};
Ort::Value mel = Ort::Value::CreateTensor<float>(
model_->Allocator(), shape.data(), shape.size());
@@ -123,7 +143,7 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
std::copy(f.begin(), f.end(), p_mel);
memset(p_mel + f.size(), 0,
(max_num_frames - num_frames) * feat_dim * sizeof(float));
(actual_frames - num_frames) * feat_dim * sizeof(float));
mel = Transpose12(model_->Allocator(), &mel);
try {

View File

@@ -32,6 +32,14 @@ void OfflineWhisperModelConfig::Register(ParseOptions *po) {
"Valid values: transcribe, translate. "
"Note that for non-multilingual models, it supports "
"only 'transcribe'");
po->Register(
"whisper-tail-paddings", &tail_paddings,
"Suggest value: 50 for English models. 300 for multilingual models. "
"Since we have removed the 30-second constraint, we need to add some "
"tail padding frames "
"so that whisper can detect the eot token. Leave it to -1 to use 50 for "
"English models and 300 for multilingual models.");
}
bool OfflineWhisperModelConfig::Validate() const {
@@ -63,7 +71,8 @@ std::string OfflineWhisperModelConfig::ToString() const {
os << "encoder=\"" << encoder << "\", ";
os << "decoder=\"" << decoder << "\", ";
os << "language=\"" << language << "\", ";
os << "task=\"" << task << "\")";
os << "task=\"" << task << "\", ";
os << "tail_paddings=" << tail_paddings << ")";
return os.str();
}

View File

@@ -28,12 +28,26 @@ struct OfflineWhisperModelConfig {
// Note: For non-multilingual models, it supports only "transcribe"
std::string task = "transcribe";
// Number of tail padding frames.
//
// Since we remove the 30-second constraint, we need to add some paddings
// at the end.
//
// Recommended values:
// - 50 for English models
// - 300 for multilingual models
int32_t tail_paddings = -1;
OfflineWhisperModelConfig() = default;
OfflineWhisperModelConfig(const std::string &encoder,
const std::string &decoder,
const std::string &language,
const std::string &task)
: encoder(encoder), decoder(decoder), language(language), task(task) {}
const std::string &task, int32_t tail_paddings)
: encoder(encoder),
decoder(decoder),
language(language),
task(task),
tail_paddings(tail_paddings) {}
void Register(ParseOptions *po);
bool Validate() const;