198 lines
8.3 KiB
Python
198 lines
8.3 KiB
Python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Testing suite for the Parakeet feature extraction."""
|
|
|
|
import itertools
|
|
import random
|
|
import unittest
|
|
|
|
import numpy as np
|
|
|
|
from transformers import ParakeetFeatureExtractor
|
|
from transformers.testing_utils import require_torch
|
|
from transformers.utils import is_datasets_available, is_torch_available
|
|
|
|
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
if is_datasets_available():
|
|
from datasets import load_dataset
|
|
|
|
# Module-level random generator; `floats_list` falls back to it when no `rng` is passed.
global_rng = random.Random()
|
|
|
|
|
|
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Build a nested Python list of random floats.

    The result has `shape[0]` rows of `shape[1]` values each; every value is
    `rng.random() * scale`. When `rng` is None the module-level `global_rng`
    is used. `name` is accepted but not used.
    """
    generator = global_rng if rng is None else rng

    # Values are drawn row by row, so a given `rng` state always yields the same layout.
    return [[generator.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
|
|
|
|
|
|
class ParakeetFeatureExtractionTester:
    """Holds the feature-extraction settings and batch geometry used by the Parakeet tests.

    It can return those settings as a dict (`prepare_feat_extract_dict`) and build
    batches of random nested-list inputs (`prepare_inputs_for_common`).
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=80,
        hop_length=160,
        win_length=400,
        n_fft=512,
        sampling_rate=16000,
        padding_value=0.0,
    ):
        """Store the given settings on the instance and derive `seq_length_diff`."""
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive input lengths when inputs are not of equal length.
        # NOTE: divides by `batch_size - 1`, so `batch_size` must be greater than 1.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
        self.feature_size = feature_size
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = n_fft
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value

    def prepare_feat_extract_dict(self):
        """Return the feature-extraction settings stored on this tester as a dict."""
        return {
            "feature_size": self.feature_size,
            "hop_length": self.hop_length,
            "win_length": self.win_length,
            "n_fft": self.n_fft,
            "sampling_rate": self.sampling_rate,
            "padding_value": self.padding_value,
        }

    # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common
    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        def _flatten(list_of_lists):
            return list(itertools.chain(*list_of_lists))

        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                floats_list((x, self.feature_size))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]
        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]
        return speech_inputs
|
|
|
|
|
|
class ParakeetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for `ParakeetFeatureExtractor`: the shared mixin tests plus two torch integration tests."""

    feature_extraction_class = ParakeetFeatureExtractor

    def setUp(self):
        """Create the tester object that supplies settings and dummy inputs."""
        self.feat_extract_tester = ParakeetFeatureExtractionTester(self)

    def _load_datasamples(self, num_samples):
        """Return the audio arrays of the first `num_samples` samples (sorted by "id")
        from the validation split of the dummy LibriSpeech dataset."""
        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id")[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    @require_torch
    def test_torch_integration(self):
        """
        Check features and attention mask for a single sample against reference values.

        reproducer: https://gist.github.com/eustlb/c4a0999e54466b7e8d8b040d8e0900df
        """
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                0.60935932, 1.18187428, 1.29877627, 1.36461377, 1.09311509, 1.39821815,
                1.63753450, 1.37100816, 1.26510608, 1.70332706, 1.69067430, 1.28770995,
                1.52999651, 1.77962756, 1.71420062, 1.21944094, 1.30884087, 1.44343364,
                1.17694926, 1.42690814, 1.78877723, 1.68655288, 1.27155364, 1.66103351,
                1.75820673, 1.41575801, 1.40622294, 1.70603478, 1.63117850, 1.13353217,
            ]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = ParakeetFeatureExtractor()
        inputs = feature_extractor(input_speech, return_tensors="pt")

        self.assertEqual(inputs.input_features.shape, (1, 586, 80))
        # Compare the first 30 feature values of frame 100 with the reference.
        torch.testing.assert_close(inputs.input_features[0, 100, :30], EXPECTED_INPUT_FEATURES, atol=1e-4, rtol=1e-4)

        self.assertEqual(inputs.attention_mask.shape, (1, 586))
        # last frame should be masked
        self.assertEqual(inputs.attention_mask.sum(), 585)

    @require_torch
    def test_torch_integration_batch(self):
        """
        Check features and attention mask for a batch of five samples against reference values.

        reproducer: https://gist.github.com/eustlb/c4a0999e54466b7e8d8b040d8e0900df
        """
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                [ 0.60935932,  1.18187428,  1.29877627,  1.36461377,  1.09311533,
                  1.39821827,  1.63753450,  1.37100816,  1.26510608,  1.70332706,
                  1.69067478,  1.28770995,  1.52999651,  1.77962780,  1.71420062,
                  1.21944094,  1.30884087,  1.44343400,  1.17694926,  1.42690814,
                  1.78877664,  1.68655288,  1.27155364,  1.66103351,  1.75820673,
                  1.41575801,  1.40622294,  1.70603478,  1.63117862,  1.13353217],
                [ 0.58339858,  0.54317272,  0.46222782,  0.34154415,  0.17806509,
                  0.32182255,  0.28909618,  0.02141305, -0.09710173, -0.35818669,
                 -0.48172510, -0.52942866, -0.58029658, -0.70519227, -0.67929971,
                 -0.54698551, -0.28611183, -0.24780270, -0.31363955, -0.41913241,
                 -0.32394424, -0.44897896, -0.68657434, -0.62047797, -0.46886450,
                 -0.65987164, -1.02435589, -0.58527517, -0.56095684, -0.73582536],
                [-0.91937613, -0.97933632, -1.06843162, -1.02642107, -0.94232899,
                 -0.83840621, -0.82306921, -0.45763230, -0.45182887, -0.75917768,
                 -0.42541453, -0.28512970, -0.39637473, -0.66478080, -0.68004298,
                 -0.49690303, -0.31799242, -0.12917191,  0.13149273,  0.10163058,
                 -0.40041649,  0.05001565,  0.23906317,  0.28816083,  0.14308788,
                 -0.29588422, -0.05428466,  0.14418560,  0.28865972, -0.12138986],
                [ 0.73217624,  0.84484011,  0.79323846,  0.66315967,  0.41556871,
                  0.88633078,  0.90718138,  0.91268104,  1.15920067,  1.26141894,
                  1.10222173,  0.92990804,  0.96352047,  0.88142169,  0.56635213,
                  0.71491158,  0.81301254,  0.67301887,  0.74780160,  0.64429688,
                  0.22885245,  0.47035533,  0.46498337,  0.17544533,  0.44458991,
                  0.79245001,  0.57207537,  0.85768145,  1.00491571,  0.93360955],
                [ 1.40496337,  1.32492661,  1.16519547,  0.98379827,  0.77614164,
                  0.95871657,  0.81910741,  1.23010278,  1.33011520,  1.16538525,
                  1.28319681,  1.45041633,  1.33421600,  0.91677380,  0.67107433,
                  0.52890682,  0.82009870,  1.15821445,  1.15343642,  1.10958862,
                  1.44962490,  1.44485891,  1.46043479,  1.90800595,  1.95863307,
                  1.63670933,  1.49021459,  1.18701911,  0.74906683,  0.84700620]
            ]
        )
        # fmt: on

        input_speech = self._load_datasamples(5)
        feature_extractor = ParakeetFeatureExtractor()
        inputs = feature_extractor(input_speech, return_tensors="pt")

        self.assertEqual(inputs.input_features.shape, (5, 2941, 80))
        # Compare the first 30 feature values of frame 100 of every sample with the reference.
        torch.testing.assert_close(inputs.input_features[:, 100, :30], EXPECTED_INPUT_FEATURES, atol=1e-4, rtol=1e-4)

        self.assertEqual(inputs.attention_mask.shape, (5, 2941))
        # Must be assertEqual: assertTrue(value, expected) would treat the expected list as
        # the failure message and pass for any non-empty list.
        self.assertEqual(inputs.attention_mask.sum(dim=-1).tolist(), [585, 481, 1248, 990, 2940])
|