Remove the 30-second constraint from whisper. (#471)

This commit is contained in:
Fangjun Kuang
2023-12-07 17:47:08 +08:00
committed by GitHub
parent a7d69359c9
commit 3ae984f148
10 changed files with 178 additions and 78 deletions

View File

@@ -253,8 +253,21 @@ def compute_features(filename: str) -> torch.Tensor:
log_spec = torch.clamp(features, min=1e-10).log10()
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
mel = (log_spec + 4.0) / 4.0
# mel (T, 80)
# We pad 50 frames at the end so that the model is able to detect the
# end-of-transcript (EOT) token. You can use another value instead of 50.
mel = torch.nn.functional.pad(mel, (0, 0, 0, 50), "constant", 0)
# Note that if decoding raises an error for a multilingual model,
# please use a larger padding value, say 300
target = 3000
mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
if mel.shape[0] > target:
mel = mel[:target]
# We don't need to pad it to 30 seconds now!
# mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
mel = mel.t().unsqueeze(0)
return mel