init
This commit is contained in:
0
transformers/tests/models/parakeet/__init__.py
Normal file
0
transformers/tests/models/parakeet/__init__.py
Normal file
@@ -0,0 +1,197 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the Parakeet feature extraction."""
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import ParakeetFeatureExtractor
|
||||
from transformers.testing_utils import require_torch
|
||||
from transformers.utils import is_datasets_available, is_torch_available
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_datasets_available():
|
||||
from datasets import load_dataset
|
||||
|
||||
global_rng = random.Random()
|
||||
|
||||
|
||||
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Create a nested list of random floats in [0, scale) with shape (shape[0], shape[1])."""
    # Fall back to the shared module-level RNG so repeated calls draw from one stream.
    generator = global_rng if rng is None else rng
    num_rows, num_cols = shape[0], shape[1]
    return [[generator.random() * scale for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
|
||||
class ParakeetFeatureExtractionTester:
    """Builds feature-extractor kwargs and synthetic speech inputs for ParakeetFeatureExtractor tests."""

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=80,
        hop_length=160,
        win_length=400,
        n_fft=512,
        sampling_rate=16000,
        padding_value=0.0,
    ):
        # `parent` is the unittest.TestCase using this tester (kept for assertion delegation).
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive sample lengths so the batch spans [min_seq_length, max_seq_length).
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
        self.feature_size = feature_size
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = n_fft
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value

    def prepare_feat_extract_dict(self):
        """Return the kwargs used to instantiate the feature extractor under test."""
        return {
            "feature_size": self.feature_size,
            "hop_length": self.hop_length,
            "win_length": self.win_length,
            "n_fft": self.n_fft,
            "sampling_rate": self.sampling_rate,
            "padding_value": self.padding_value,
        }

    # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common
    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        def _flatten(list_of_lists):
            return list(itertools.chain(*list_of_lists))

        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                floats_list((x, self.feature_size))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]
        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]
        return speech_inputs
|
||||
|
||||
|
||||
class ParakeetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Common and integration tests for ParakeetFeatureExtractor."""

    feature_extraction_class = ParakeetFeatureExtractor

    def setUp(self):
        self.feat_extract_tester = ParakeetFeatureExtractionTester(self)

    def _load_datasamples(self, num_samples):
        """Return the first `num_samples` audio arrays (sorted by id) from the dummy LibriSpeech set."""
        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id")[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    @require_torch
    def test_torch_integration(self):
        """
        reproducer: https://gist.github.com/eustlb/c4a0999e54466b7e8d8b040d8e0900df
        """
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                0.60935932, 1.18187428, 1.29877627, 1.36461377, 1.09311509, 1.39821815,
                1.63753450, 1.37100816, 1.26510608, 1.70332706, 1.69067430, 1.28770995,
                1.52999651, 1.77962756, 1.71420062, 1.21944094, 1.30884087, 1.44343364,
                1.17694926, 1.42690814, 1.78877723, 1.68655288, 1.27155364, 1.66103351,
                1.75820673, 1.41575801, 1.40622294, 1.70603478, 1.63117850, 1.13353217,
            ]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = ParakeetFeatureExtractor()
        inputs = feature_extractor(input_speech, return_tensors="pt")

        self.assertEqual(inputs.input_features.shape, (1, 586, 80))
        torch.testing.assert_close(inputs.input_features[0, 100, :30], EXPECTED_INPUT_FEATURES, atol=1e-4, rtol=1e-4)

        self.assertEqual(inputs.attention_mask.shape, (1, 586))
        # last frame should be masked
        self.assertEqual(inputs.attention_mask.sum(), 585)

    @require_torch
    def test_torch_integration_batch(self):
        """
        reproducer: https://gist.github.com/eustlb/c4a0999e54466b7e8d8b040d8e0900df
        """
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                [ 0.60935932, 1.18187428, 1.29877627, 1.36461377, 1.09311533,
                  1.39821827, 1.63753450, 1.37100816, 1.26510608, 1.70332706,
                  1.69067478, 1.28770995, 1.52999651, 1.77962780, 1.71420062,
                  1.21944094, 1.30884087, 1.44343400, 1.17694926, 1.42690814,
                  1.78877664, 1.68655288, 1.27155364, 1.66103351, 1.75820673,
                  1.41575801, 1.40622294, 1.70603478, 1.63117862, 1.13353217],
                [ 0.58339858, 0.54317272, 0.46222782, 0.34154415, 0.17806509,
                  0.32182255, 0.28909618, 0.02141305, -0.09710173, -0.35818669,
                  -0.48172510, -0.52942866, -0.58029658, -0.70519227, -0.67929971,
                  -0.54698551, -0.28611183, -0.24780270, -0.31363955, -0.41913241,
                  -0.32394424, -0.44897896, -0.68657434, -0.62047797, -0.46886450,
                  -0.65987164, -1.02435589, -0.58527517, -0.56095684, -0.73582536],
                [-0.91937613, -0.97933632, -1.06843162, -1.02642107, -0.94232899,
                  -0.83840621, -0.82306921, -0.45763230, -0.45182887, -0.75917768,
                  -0.42541453, -0.28512970, -0.39637473, -0.66478080, -0.68004298,
                  -0.49690303, -0.31799242, -0.12917191, 0.13149273, 0.10163058,
                  -0.40041649, 0.05001565, 0.23906317, 0.28816083, 0.14308788,
                  -0.29588422, -0.05428466, 0.14418560, 0.28865972, -0.12138986],
                [ 0.73217624, 0.84484011, 0.79323846, 0.66315967, 0.41556871,
                  0.88633078, 0.90718138, 0.91268104, 1.15920067, 1.26141894,
                  1.10222173, 0.92990804, 0.96352047, 0.88142169, 0.56635213,
                  0.71491158, 0.81301254, 0.67301887, 0.74780160, 0.64429688,
                  0.22885245, 0.47035533, 0.46498337, 0.17544533, 0.44458991,
                  0.79245001, 0.57207537, 0.85768145, 1.00491571, 0.93360955],
                [ 1.40496337, 1.32492661, 1.16519547, 0.98379827, 0.77614164,
                  0.95871657, 0.81910741, 1.23010278, 1.33011520, 1.16538525,
                  1.28319681, 1.45041633, 1.33421600, 0.91677380, 0.67107433,
                  0.52890682, 0.82009870, 1.15821445, 1.15343642, 1.10958862,
                  1.44962490, 1.44485891, 1.46043479, 1.90800595, 1.95863307,
                  1.63670933, 1.49021459, 1.18701911, 0.74906683, 0.84700620]
            ]
        )
        # fmt: on

        input_speech = self._load_datasamples(5)
        feature_extractor = ParakeetFeatureExtractor()
        inputs = feature_extractor(input_speech, return_tensors="pt")

        self.assertEqual(inputs.input_features.shape, (5, 2941, 80))
        torch.testing.assert_close(inputs.input_features[:, 100, :30], EXPECTED_INPUT_FEATURES, atol=1e-4, rtol=1e-4)

        self.assertEqual(inputs.attention_mask.shape, (5, 2941))
        # BUG FIX: the original used `assertTrue(x, y)`, which treats `y` as a failure
        # message and never compares the values — use assertListEqual to actually assert.
        self.assertListEqual(inputs.attention_mask.sum(dim=-1).tolist(), [585, 481, 1248, 990, 2940])
|
||||
380
transformers/tests/models/parakeet/test_modeling_parakeet.py
Normal file
380
transformers/tests/models/parakeet/test_modeling_parakeet.py
Normal file
@@ -0,0 +1,380 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Parakeet model."""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import is_datasets_available, is_torch_available
|
||||
from transformers.testing_utils import cleanup, require_torch, slow, torch_device
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
|
||||
|
||||
|
||||
if is_datasets_available():
|
||||
from datasets import Audio, load_dataset
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
ParakeetCTCConfig,
|
||||
ParakeetEncoder,
|
||||
ParakeetEncoderConfig,
|
||||
ParakeetForCTC,
|
||||
)
|
||||
|
||||
|
||||
class ParakeetEncoderModelTester:
    """Builds small ParakeetEncoder configs and random inputs for the common model tests."""

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=1024,
        is_training=True,
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=256,
        hidden_act="silu",
        dropout=0,  # so gradient checkpointing doesn't fail
        conv_kernel_size=9,
        subsampling_factor=8,
        subsampling_conv_channels=32,
        use_bias=True,
        num_mel_bins=80,
        scale_input=True,
    ):
        # testing suite parameters
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.num_mel_bins = num_mel_bins
        self.is_training = is_training

        # config parameters
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.dropout = dropout
        self.conv_kernel_size = conv_kernel_size
        self.subsampling_factor = subsampling_factor
        self.subsampling_conv_channels = subsampling_conv_channels
        self.use_bias = use_bias
        # (removed a duplicate `self.num_mel_bins = num_mel_bins` — already set above)
        self.scale_input = scale_input

        # Calculate output sequence length after subsampling
        self.output_seq_length = seq_length // subsampling_factor
        self.encoder_seq_length = self.output_seq_length
        self.key_length = self.output_seq_length

    def prepare_config_and_inputs(self):
        """Return (config, input_features, attention_mask) with random mel features."""
        input_features = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins])
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
        config = self.get_config()

        return config, input_features, attention_mask

    def get_config(self):
        """Build a ParakeetEncoderConfig; all dropout-style fields share `self.dropout` (0 by default)."""
        return ParakeetEncoderConfig(
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            dropout=self.dropout,
            dropout_positions=self.dropout,
            layerdrop=self.dropout,
            activation_dropout=self.dropout,
            attention_dropout=self.dropout,
            conv_kernel_size=self.conv_kernel_size,
            subsampling_factor=self.subsampling_factor,
            subsampling_conv_channels=self.subsampling_conv_channels,
            use_bias=self.use_bias,
            num_mel_bins=self.num_mel_bins,
            scale_input=self.scale_input,
        )

    def create_and_check_model(self, config, input_features, attention_mask):
        """Forward a batch through ParakeetEncoder and check the hidden-state shape."""
        model = ParakeetEncoder(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_features, attention_mask=attention_mask)

        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, config.hidden_size)
        )

    def prepare_config_and_inputs_for_common(self):
        config, input_features, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_features": input_features,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict

    def check_ctc_loss(self, config, input_values, *args):
        """Check that ParakeetForCTC produces a scalar float loss under both CTC reductions."""
        model = ParakeetForCTC(config=config)
        model.to(torch_device)

        # make sure that dropout is disabled
        model.eval()

        input_values = input_values[:3]
        # NOTE(review): input_values is 3D (batch, seq, mel) here, so this mask is 3D as well;
        # pattern is copied from 2D-input models — confirm the model accepts a 3D mask.
        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i] :] = 0.0
            attention_mask[i, input_lengths[i] :] = 0

        model.config.ctc_loss_reduction = "sum"
        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        model.config.ctc_loss_reduction = "mean"
        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        self.parent.assertTrue(isinstance(sum_loss, float))
        self.parent.assertTrue(isinstance(mean_loss, float))
|
||||
|
||||
|
||||
@require_torch
class ParakeetEncoderModelTest(ModelTesterMixin, unittest.TestCase):
    """Common model tests for the standalone ParakeetEncoder."""

    all_model_classes = (ParakeetEncoder,) if is_torch_available() else ()

    # Feature switches consumed by ModelTesterMixin.
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    test_torch_exportable = True

    def setUp(self):
        self.model_tester = ParakeetEncoderModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ParakeetEncoderConfig, has_text_modality=False)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        prepared = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*prepared)

    @unittest.skip(reason="ParakeetEncoder does not use inputs_embeds")
    def test_model_get_set_embeddings(self):
        pass
|
||||
|
||||
|
||||
class ParakeetForCTCModelTester:
    """Wraps ParakeetEncoderModelTester and adds the CTC head configuration."""

    def __init__(self, parent, encoder_kwargs=None, is_training=True, vocab_size=128, pad_token_id=0):
        if encoder_kwargs is None:
            encoder_kwargs = {}

        self.parent = parent
        self.encoder_model_tester = ParakeetEncoderModelTester(parent, **encoder_kwargs)
        self.is_training = is_training

        # Mirror the encoder tester's geometry so the common tests can read it from here.
        self.batch_size = self.encoder_model_tester.batch_size
        self.output_seq_length = self.encoder_model_tester.output_seq_length
        self.num_hidden_layers = self.encoder_model_tester.num_hidden_layers
        # NOTE(review): seq_length is set to vocab_size rather than the encoder's sequence
        # length — presumably intentional for the common tests' sizing, but worth confirming.
        self.seq_length = vocab_size
        self.hidden_size = self.encoder_model_tester.hidden_size

        self.vocab_size = vocab_size
        self.pad_token_id = pad_token_id

    def prepare_config_and_inputs(self):
        """Delegate input creation to the encoder tester and swap in the CTC config."""
        _, input_features, attention_mask = self.encoder_model_tester.prepare_config_and_inputs()
        config = self.get_config()
        return config, input_features, attention_mask

    def get_config(self):
        return ParakeetCTCConfig.from_encoder_config(
            encoder_config=self.encoder_model_tester.get_config(),
            vocab_size=self.vocab_size,
            pad_token_id=self.pad_token_id,
        )

    def create_and_check_model(self, config, input_features, attention_mask):
        """Forward a batch through ParakeetForCTC and check the logits shape."""
        model = ParakeetForCTC(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_features, attention_mask=attention_mask)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size))

    def prepare_config_and_inputs_for_common(self):
        config, input_features, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_features": input_features,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict

    def test_ctc_loss_inference(self):
        # BUG FIX: this helper lives on the tester itself, which has no `model_tester`
        # attribute — the original `self.model_tester.prepare_config_and_inputs()` raised
        # AttributeError. Prepare the inputs directly.
        config_and_inputs = self.prepare_config_and_inputs()
        self.encoder_model_tester.check_ctc_loss(*config_and_inputs)
|
||||
|
||||
|
||||
@require_torch
class ParakeetForCTCModelTest(ModelTesterMixin, unittest.TestCase):
    """Common model tests for ParakeetForCTC (encoder + CTC head composite)."""

    all_model_classes = (ParakeetForCTC,) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
            "feature-extraction": ParakeetEncoder,
            "automatic-speech-recognition": ParakeetForCTC,
        }
        if is_torch_available()
        else {}
    )

    # Feature switches consumed by ModelTesterMixin.
    test_attention_outputs = False
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    test_torch_exportable = True

    _is_composite = True

    def setUp(self):
        self.model_tester = ParakeetForCTCModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ParakeetCTCConfig)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="ParakeetEncoder does not use inputs_embeds")
    def test_model_get_set_embeddings(self):
        pass

    # Original function assumes vision+text model, so overwrite since Parakeet is audio+text
    # Below is modified from `tests/models/granite_speech/test_modeling_granite_speech.py`
    def test_sdpa_can_dispatch_composite_models(self):
        if not self.has_attentions:
            self.skipTest(reason="Model architecture does not support attentions")

        if not self._is_composite:
            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")

        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                # Default load should pick SDPA where available.
                model_sdpa = model_class.from_pretrained(tmpdirname)
                model_sdpa = model_sdpa.eval().to(torch_device)

                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
                model_eager = model_eager.eval().to(torch_device)
                # assertEqual gives a useful failure message (was `assertTrue(x == "eager")`).
                self.assertEqual(model_eager.config._attn_implementation, "eager")

                for name, submodule in model_eager.named_modules():
                    class_name = submodule.__class__.__name__
                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
                        raise ValueError("The eager model should not have SDPA attention layers")
|
||||
|
||||
|
||||
@require_torch
class ParakeetForCTCIntegrationTest(unittest.TestCase):
    """Slow integration tests against the released nvidia/parakeet-ctc-1.1b checkpoint."""

    _dataset = None

    @classmethod
    def setUpClass(cls):
        # BUG FIX: this was a @classmethod named `setUp`, which unittest re-ran before every
        # test; class-level fixtures belong in setUpClass (run once per class).
        cls.checkpoint_name = "nvidia/parakeet-ctc-1.1b"
        cls.dtype = torch.bfloat16
        # Reuse the class-level checkpoint id instead of repeating the literal.
        cls.processor = AutoProcessor.from_pretrained(cls.checkpoint_name)

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    @classmethod
    def _load_dataset(cls):
        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
        if cls._dataset is None:
            cls._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
            cls._dataset = cls._dataset.cast_column(
                "audio", Audio(sampling_rate=cls.processor.feature_extractor.sampling_rate)
            )

    def _load_datasamples(self, num_samples):
        """Return the first `num_samples` audio arrays (sorted by id) from the dummy LibriSpeech set."""
        self._load_dataset()
        ds = self._dataset
        speech_samples = ds.sort("id")[:num_samples]["audio"]
        return [x["array"] for x in speech_samples]

    @slow
    def test_1b_model_integration(self):
        """
        bezzam reproducer (creates JSON directly in repo): https://gist.github.com/ebezzam/6382bdabfc64bb2541ca9f77deb7678d#file-reproducer_single-py
        eustlb reproducer: https://gist.github.com/eustlb/6e9e3aa85de3f7c340ec3c36e65f2fe6
        """
        RESULTS_PATH = Path(__file__).parent.parent.parent / "fixtures/parakeet/expected_results_single.json"
        with open(RESULTS_PATH, "r") as f:
            raw_data = json.load(f)
        EXPECTED_TOKEN_IDS = torch.tensor(raw_data["token_ids"])
        EXPECTED_TRANSCRIPTIONS = raw_data["transcriptions"]

        samples = self._load_datasamples(1)
        # device_map already places the model on `torch_device`; the original's extra
        # `model.to(torch_device)` was redundant and has been dropped.
        model = ParakeetForCTC.from_pretrained(self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device)
        model.eval()

        # -- apply
        inputs = self.processor(samples)
        # NOTE(review): relies on BatchFeature.to mutating in place (return value unused) — confirm.
        inputs.to(torch_device, dtype=self.dtype)
        predicted_ids = model.generate(**inputs)
        torch.testing.assert_close(predicted_ids.cpu(), EXPECTED_TOKEN_IDS)
        predicted_transcripts = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
        self.assertListEqual(predicted_transcripts, EXPECTED_TRANSCRIPTIONS)

    @slow
    def test_1b_model_integration_batched(self):
        """
        bezzam reproducer (creates JSON directly in repo): https://gist.github.com/ebezzam/6382bdabfc64bb2541ca9f77deb7678d#file-reproducer_batched-py
        eustlb reproducer: https://gist.github.com/eustlb/575b5da58de34a70116a1955b1183596
        """

        RESULTS_PATH = Path(__file__).parent.parent.parent / "fixtures/parakeet/expected_results_batch.json"
        with open(RESULTS_PATH, "r") as f:
            raw_data = json.load(f)
        EXPECTED_TOKEN_IDS = torch.tensor(raw_data["token_ids"])
        EXPECTED_TRANSCRIPTIONS = raw_data["transcriptions"]

        samples = self._load_datasamples(5)
        model = ParakeetForCTC.from_pretrained(self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device)
        model.eval()

        # -- apply
        inputs = self.processor(samples)
        inputs.to(torch_device, dtype=self.dtype)
        predicted_ids = model.generate(**inputs)
        torch.testing.assert_close(predicted_ids.cpu(), EXPECTED_TOKEN_IDS)
        predicted_transcripts = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
        self.assertListEqual(predicted_transcripts, EXPECTED_TRANSCRIPTIONS)
|
||||
@@ -0,0 +1,49 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoProcessor, ParakeetProcessor
|
||||
from transformers.testing_utils import require_torch, require_torchaudio
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
@require_torch
@require_torchaudio
class ParakeetProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Common processor tests for ParakeetProcessor (feature extractor + tokenizer pair)."""

    processor_class = ParakeetProcessor
    # The mixin's text inputs map to the `labels` key for this processor.
    text_input_name = "labels"

    @classmethod
    def setUpClass(cls):
        # Save a pretrained processor into a temp dir so the accessors below can reload it.
        cls.tmpdirname = tempfile.mkdtemp()
        cls.checkpoint = "nvidia/parakeet-ctc-1.1b"
        processor = ParakeetProcessor.from_pretrained(cls.checkpoint)
        processor.save_pretrained(cls.tmpdirname)

    def get_tokenizer(self, **kwargs):
        """Reload just the tokenizer component from the saved processor."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    def get_feature_extractor(self, **kwargs):
        """Reload just the feature-extractor component from the saved processor."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).feature_extractor

    def get_processor(self, **kwargs):
        """Reload the full processor from the saved directory."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
@@ -0,0 +1,53 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the ParakeetCTC tokenizer."""
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.models.parakeet import ParakeetTokenizerFast
|
||||
|
||||
from ...test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
|
||||
class ParakeetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    """Common tokenizer tests for ParakeetTokenizerFast (fast-only tokenizer)."""

    slow_tokenizer_class = None
    rust_tokenizer_class = ParakeetTokenizerFast
    tokenizer_class = ParakeetTokenizerFast
    test_slow_tokenizer = False
    test_rust_tokenizer = True
    from_pretrained_id = "nvidia/parakeet-ctc-1.1b"

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Reuse the class-level checkpoint id instead of repeating the literal.
        tokenizer = ParakeetTokenizerFast.from_pretrained(cls.from_pretrained_id)
        tokenizer.save_pretrained(cls.tmpdirname)

    @unittest.skip(
        reason="This test does not apply to ParakeetTokenizerFast. More details in the test docstring itself."
    )
    def test_added_tokens_do_lower_case(self):
        """
        Precompiled normalization from sentencepiece is `nmt_nfkc_cf` that includes lowercasing. Yet, ParakeetTokenizerFast does not have a do_lower_case attribute.
        This results in the test failing.
        """
        pass

    @unittest.skip(reason="This needs a slow tokenizer. Parakeet does not have one!")
    def test_encode_decode_with_spaces(self):
        return

    @unittest.skip(reason="ParakeetTokenizerFast doesn't have tokenizer_file in its signature.")
    def test_rust_tokenizer_signature(self):
        pass
|
||||
Reference in New Issue
Block a user