Remove the 30-second constraint from whisper. (#471)
This commit is contained in:
@@ -253,8 +253,21 @@ def compute_features(filename: str) -> torch.Tensor:
|
||||
log_spec = torch.clamp(features, min=1e-10).log10()
|
||||
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
|
||||
mel = (log_spec + 4.0) / 4.0
|
||||
# mel (T, 80)
|
||||
|
||||
# We pad 50 frames at the end so that the model is able to detect the end-of-text (EOT) token.
|
||||
# You can use another value instead of 50.
|
||||
mel = torch.nn.functional.pad(mel, (0, 0, 0, 50), "constant", 0)
|
||||
# Note that if it throws an exception for a multilingual model,
|
||||
# please use a larger value, say 300
|
||||
|
||||
target = 3000
|
||||
mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
|
||||
if mel.shape[0] > target:
|
||||
mel = mel[:target]
|
||||
|
||||
# We don't need to pad it to 30 seconds now!
|
||||
# mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
|
||||
|
||||
mel = mel.t().unsqueeze(0)
|
||||
|
||||
return mel
|
||||
|
||||
Reference in New Issue
Block a user