Remove the 30-second constraint from whisper. (#471)

This commit is contained in:
Fangjun Kuang
2023-12-07 17:47:08 +08:00
committed by GitHub
parent a7d69359c9
commit 3ae984f148
10 changed files with 178 additions and 78 deletions

View File

@@ -253,8 +253,21 @@ def compute_features(filename: str) -> torch.Tensor:
log_spec = torch.clamp(features, min=1e-10).log10()
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
mel = (log_spec + 4.0) / 4.0
# mel (T, 80)
# We pad 50 frames at the end so that the model is able to detect the
# end-of-transcript (EOT) token. You can use another value instead of 50.
mel = torch.nn.functional.pad(mel, (0, 0, 0, 50), "constant", 0)
# Note that if decoding raises an error for a multilingual model,
# please use a larger padding value, say 300
target = 3000
mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
if mel.shape[0] > target:
mel = mel[:target]
# We don't need to pad it to 30 seconds now!
# mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
mel = mel.t().unsqueeze(0)
return mel