This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,376 @@
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import random
import tempfile
import unittest
import numpy as np
from datasets import load_dataset
from transformers import WhisperFeatureExtractor
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
require_torch_accelerator,
)
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torch_available():
import torch
global_rng = random.Random()
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
class WhisperFeatureExtractionTester:
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
feature_size=10,
hop_length=160,
chunk_length=8,
padding_value=0.0,
sampling_rate=4_000,
return_attention_mask=False,
do_normalize=True,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.padding_value = padding_value
self.sampling_rate = sampling_rate
self.return_attention_mask = return_attention_mask
self.do_normalize = do_normalize
self.feature_size = feature_size
self.chunk_length = chunk_length
self.hop_length = hop_length
def prepare_feat_extract_dict(self):
return {
"feature_size": self.feature_size,
"hop_length": self.hop_length,
"chunk_length": self.chunk_length,
"padding_value": self.padding_value,
"sampling_rate": self.sampling_rate,
"return_attention_mask": self.return_attention_mask,
"do_normalize": self.do_normalize,
}
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
else:
# make sure that inputs increase in size
speech_inputs = [
floats_list((x, self.feature_size))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = WhisperFeatureExtractor
def setUp(self):
self.feat_extract_tester = WhisperFeatureExtractionTester(self)
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_feat_extract_from_pretrained_kwargs(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(
tmpdirname, feature_size=2 * self.feat_extract_dict["feature_size"]
)
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(2 * mel_1.shape[1] == mel_2.shape[1])
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# Test feature size
input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
self.assertTrue(input_features.ndim == 3)
self.assertTrue(input_features.shape[-1] == feature_extractor.nb_max_frames)
self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)
# Test not batched input
encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
# Test batched
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Test 2-D numpy arrays are batched.
speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
np_speech_inputs = np.asarray(speech_inputs)
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Test truncation required
speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs]
np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
def test_dither(self):
np.random.seed(42) # seed the dithering randn()
# Tests that features with and without little dithering are similar, but not the same
dict_no_dither = self.feat_extract_tester.prepare_feat_extract_dict()
dict_no_dither["dither"] = 0.0
dict_dither = self.feat_extract_tester.prepare_feat_extract_dict()
dict_dither["dither"] = 0.00003 # approx. 1/32k
feature_extractor_no_dither = self.feature_extraction_class(**dict_no_dither)
feature_extractor_dither = self.feature_extraction_class(**dict_dither)
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# compute features
input_features_no_dither = feature_extractor_no_dither(
np_speech_inputs, padding=True, return_tensors="np", sampling_rate=dict_no_dither["sampling_rate"]
).input_features
input_features_dither = feature_extractor_dither(
np_speech_inputs, padding=True, return_tensors="np", sampling_rate=dict_dither["sampling_rate"]
).input_features
# test there is a difference between features (there's added noise to input signal)
diff = input_features_dither - input_features_no_dither
# features are not identical
self.assertTrue(np.abs(diff).mean() > 1e-6)
# features are not too different
self.assertTrue(np.abs(diff).mean() <= 1e-4)
self.assertTrue(np.abs(diff).max() <= 5e-3)
def test_feature_shape(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
hop_length = feature_extractor.hop_length
test_inputs = np.random.randn(16000)
self.assertTrue(
feature_extractor(
[test_inputs[: hop_length * 5 + 1]],
return_attention_mask=True,
padding=False,
return_tensors="np",
).attention_mask.shape[-1]
== 5
)
self.assertTrue(
feature_extractor(
[test_inputs[: hop_length * 5]],
return_attention_mask=True,
padding=False,
return_tensors="np",
).attention_mask.shape[-1]
== 5
)
self.assertTrue(
feature_extractor(
[test_inputs[: hop_length * 5 - 1]],
return_attention_mask=True,
padding=False,
return_tensors="np",
).attention_mask.shape[-1]
== 4
)
@require_torch
def test_double_precision_pad(self):
import torch
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
py_speech_inputs = np_speech_inputs.tolist()
for inputs in [py_speech_inputs, np_speech_inputs]:
np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
self.assertTrue(np_processed.input_features.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id")[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
@require_torch_accelerator
@require_torch
def test_torch_integration(self):
# fmt: off
EXPECTED_INPUT_FEATURES = torch.tensor(
[
0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
-0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
]
)
# fmt: on
input_speech = self._load_datasamples(1)
feature_extractor = WhisperFeatureExtractor()
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
self.assertEqual(input_features.shape, (1, 80, 3000))
torch.testing.assert_close(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4)
@unittest.mock.patch("transformers.models.whisper.feature_extraction_whisper.is_torch_available", lambda: False)
def test_numpy_integration(self):
# fmt: off
EXPECTED_INPUT_FEATURES = np.array(
[
0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
-0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
]
)
# fmt: on
input_speech = self._load_datasamples(1)
feature_extractor = WhisperFeatureExtractor()
input_features = feature_extractor(input_speech, return_tensors="np").input_features
self.assertEqual(input_features.shape, (1, 80, 3000))
self.assertTrue(np.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4))
def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
audio = self._load_datasamples(1)[0]
audio = ((audio - audio.min()) / (audio.max() - audio.min())) * 65535 # Rescale to [0, 65535] to show issue
audio = feat_extract.zero_mean_unit_var_norm([audio], attention_mask=None)[0]
self.assertTrue(np.all(np.mean(audio) < 1e-3))
self.assertTrue(np.all(np.abs(np.var(audio) - 1) < 1e-3))
@require_torch_accelerator
@require_torch
def test_torch_integration_batch(self):
# fmt: off
EXPECTED_INPUT_FEATURES = torch.tensor(
[
[
0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
-0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
],
[
-0.4696, -0.0751, 0.0276, -0.0312, -0.0540, -0.0383, 0.1295, 0.0568,
-0.2071, -0.0548, 0.0389, -0.0316, -0.2346, -0.1068, -0.0322, 0.0475,
-0.1709, -0.0041, 0.0872, 0.0537, 0.0075, -0.0392, 0.0371, 0.0189,
-0.1522, -0.0270, 0.0744, 0.0738, -0.0245, -0.0667
],
[
-0.2337, -0.0060, -0.0063, -0.2353, -0.0431, 0.1102, -0.1492, -0.0292,
0.0787, -0.0608, 0.0143, 0.0582, 0.0072, 0.0101, -0.0444, -0.1701,
-0.0064, -0.0027, -0.0826, -0.0730, -0.0099, -0.0762, -0.0170, 0.0446,
-0.1153, 0.0960, -0.0361, 0.0652, 0.1207, 0.0277
]
]
)
# fmt: on
with torch.device("cuda"):
input_speech = self._load_datasamples(3)
feature_extractor = WhisperFeatureExtractor()
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
self.assertEqual(input_features.shape, (3, 80, 3000))
torch.testing.assert_close(input_features[:, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4)

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,461 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
import numpy as np
import pytest
from transformers import WhisperTokenizer, WhisperTokenizerFast, is_speech_available
from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
from .test_feature_extraction_whisper import floats_list
if is_speech_available():
from transformers import WhisperFeatureExtractor, WhisperProcessor
TRANSCRIBE = 50358
NOTIMESTAMPS = 50362
@require_torch
@require_torchaudio
@require_sentencepiece
class WhisperProcessorTest(unittest.TestCase):
def setUp(self):
self.checkpoint = "openai/whisper-small.en"
self.tmpdirname = tempfile.mkdtemp()
def get_tokenizer(self, **kwargs):
return WhisperTokenizer.from_pretrained(self.checkpoint, **kwargs)
def get_feature_extractor(self, **kwargs):
return WhisperFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(self.tmpdirname)
processor = WhisperProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, WhisperTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
def test_save_load_pretrained_additional_features(self):
processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
processor = WhisperProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, WhisperTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
raw_speech = floats_list((3, 1000))
input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
input_processor = processor(raw_speech, return_tensors="np")
for key in input_feat_extract:
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
input_str = "This is a test string"
encoded_processor = processor(text=input_str)
encoded_tok = tokenizer(input_str)
for key in encoded_tok:
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.batch_decode(predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
self.assertListEqual(decoded_tok, decoded_processor)
def test_get_decoder_prompt_ids(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", no_timestamps=True)
self.assertIsInstance(forced_decoder_ids, list)
for ids in forced_decoder_ids:
self.assertIsInstance(ids, (list, tuple))
expected_ids = [TRANSCRIBE, NOTIMESTAMPS]
self.assertListEqual([ids[-1] for ids in forced_decoder_ids], expected_ids)
def test_get_prompt_ids(self):
processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
prompt_ids = processor.get_prompt_ids("Mr. Quilter")
decoded_prompt = processor.tokenizer.decode(prompt_ids)
self.assertListEqual(prompt_ids.tolist(), [50360, 1770, 13, 2264, 346, 353])
self.assertEqual(decoded_prompt, "<|startofprev|> Mr. Quilter")
def test_empty_get_prompt_ids(self):
processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
prompt_ids = processor.get_prompt_ids("")
decoded_prompt = processor.tokenizer.decode(prompt_ids)
self.assertListEqual(prompt_ids.tolist(), [50360, 220])
self.assertEqual(decoded_prompt, "<|startofprev|> ")
def test_get_prompt_ids_with_special_tokens(self):
processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
def _test_prompt_error_raised_helper(prompt, special_token):
with pytest.raises(ValueError) as excinfo:
processor.get_prompt_ids(prompt)
expected = f"Encountered text in the prompt corresponding to disallowed special token: {special_token}."
self.assertEqual(expected, str(excinfo.value))
_test_prompt_error_raised_helper("<|startofprev|> test", "<|startofprev|>")
_test_prompt_error_raised_helper("test <|notimestamps|>", "<|notimestamps|>")
_test_prompt_error_raised_helper("test <|zh|> test <|transcribe|>", "<|zh|>")
def test_find_longest_common_subsequence_old(self):
"""Test using the old processing functions used in the ASR pipeline, but that serves as a BC reference."""
max_source_positions = 1500
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
previous_sequence = [[51492, 406, 3163, 1953, 466, 13, 51612, 51612]]
self.assertEqual(
processor.decode(previous_sequence[0], output_offsets=True),
{
"text": " not worth thinking about.",
"offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}],
},
)
# Merge when the previous sequence is a suffix of the next sequence
# fmt: off
next_sequences_1 = [
[50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]
]
# fmt: on
self.assertEqual(
processor.decode(next_sequences_1[0], output_offsets=True),
{
"text": (
" of spectators, retrievality is not worth thinking about. His instant panic was followed by a"
" small, sharp blow high on his chest.<|endoftext|>"
),
"offsets": [
{"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)},
{
"text": " His instant panic was followed by a small, sharp blow high on his chest.",
"timestamp": (5.0, 9.4),
},
],
},
)
merge = _find_timestamp_sequence(
[[previous_sequence, (480_000, 0, 0)], [next_sequences_1, (480_000, 120_000, 0)]],
processor.tokenizer,
processor.feature_extractor,
max_source_positions,
)
# fmt: off
self.assertEqual(
merge,
[51492, 406, 3163, 1953, 466, 13, 51739, 51739, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959],
)
# fmt: on
self.assertEqual(
processor.decode(merge, output_offsets=True),
{
"text": (
" not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
" chest."
),
"offsets": [
{"text": " not worth thinking about.", "timestamp": (22.56, 27.5)},
{
"text": " His instant panic was followed by a small, sharp blow high on his chest.",
"timestamp": (27.5, 31.900000000000002),
},
],
},
)
# Merge when the sequence is in the middle of the 1st next sequence
# fmt: off
next_sequences_2 = [
[50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]
]
# fmt: on
# {'text': ' of spectators, retrievality is not worth thinking about. His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)}
merge = _find_timestamp_sequence(
[[previous_sequence, (480_000, 0, 0)], [next_sequences_2, (480_000, 120_000, 0)]],
processor.tokenizer,
processor.feature_extractor,
max_source_positions,
)
# fmt: off
self.assertEqual(
merge,
[51492, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959],
)
# fmt: on
self.assertEqual(
processor.decode(merge, output_offsets=True),
{
"text": (
" not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
" chest."
),
"offsets": [
{
"text": (
" not worth thinking about. His instant panic was followed by a small, sharp blow high on"
" his chest."
),
"timestamp": (22.56, 31.900000000000002),
},
],
},
)
# Merge when the previous sequence is not included in the current sequence
next_sequences_3 = [[50364, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50584, 50257]] # fmt: skip
# {'text': ' His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)}
merge = _find_timestamp_sequence(
[[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 120_000, 0)]],
processor.tokenizer,
processor.feature_extractor,
max_source_positions,
)
self.assertEqual(
merge,
[51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51832],
) # fmt: skip
self.assertEqual(
processor.decode(merge, output_offsets=True),
{
"text": (
" not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
" chest."
),
"offsets": [
{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)},
{
"text": " His instant panic was followed by a small, sharp blow high on his chest.",
"timestamp": (24.96, 29.36),
},
],
},
)
# last case is when the sequence is not in the first next predicted start and end of timestamp
next_sequences_3 = [
[50364, 2812, 9836, 14783, 390, 406, 3163, 1953, 466, 13, 50634, 50634, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50934]
] # fmt: skip
merge = _find_timestamp_sequence(
[[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 167_000, 0)]],
processor.tokenizer,
processor.feature_extractor,
max_source_positions,
)
self.assertEqual(
merge,
[51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51912]
) # fmt: skip
self.assertEqual(
processor.decode(merge, output_offsets=True),
{
"text": (
" not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
" chest."
),
"offsets": [
{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)},
{
"text": " His instant panic was followed by a small, sharp blow high on his chest.",
"timestamp": (24.96, 30.96),
},
],
},
)
def _fast_find_longest_common_sequence(sequence_left, sequence_right):
"""Old processing function used in the ASR pipeline."""
seq_len_left = len(sequence_left)
seq_len_right = len(sequence_right)
counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
longest = 0
for i in range(seq_len_left):
for j in range(seq_len_right):
if sequence_left[i] == sequence_right[j]:
previous_counter = counter[i][j] + 1
counter[i + 1][j + 1] = previous_counter
if previous_counter > longest:
longest = previous_counter
counter = np.array(counter)
# we return the idx of the first element of the longest common sequence in the left sequence
index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1
index_right = np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1
return index_left, index_right, longest
def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
"""
Old processing function used in the ASR pipeline.
Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
`WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
processed. We need to make sure to offset the timestamps tokens by the `time` in order for the tokenizer to
properly compute the final `offset`.
"""
# index of the first timestamp token
timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
items = []
# approximation of the token to time ratio : ~0.2seconds
time_precision = feature_extractor.chunk_length / max_source_positions
time = 0
for seq_idx, item in enumerate(sequences):
sequence, stride = item
if isinstance(sequence, list):
sequence = np.array(sequence)
chunk_len, stride_left, stride_right = stride
sequence = sequence.squeeze(0)
# get rid of the `forced_decoder_idx` that are use to parametrize the generation
begin_idx = np.where(sequence == timestamp_begin)[0][0] if timestamp_begin in sequence else 0
sequence = sequence[begin_idx:]
timestamp_tokens = sequence >= timestamp_begin
if seq_idx != 0 and sum(timestamp_tokens) > 0:
consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
last_timestamp = np.where(timestamp_tokens)[0][-1]
consecutive = np.append(consecutive, last_timestamp) if last_timestamp not in consecutive else consecutive
time -= stride_left + stride_right
offset = int((time / feature_extractor.sampling_rate) / time_precision)
overlap_time = int((stride_left / feature_extractor.sampling_rate) / time_precision)
# relevant timestamps are in the overlapping part
relevant_timestamp = np.where(sequence[consecutive] >= timestamp_begin + overlap_time)[0]
if relevant_timestamp.shape[0] > 0:
relevant_timestamp = (
consecutive[relevant_timestamp[0] - 1] if relevant_timestamp[0] > 0 else consecutive[0]
)
# if a big stride is used, we need to check some of the previous items for the best overlap
best_match = 0
sliced_sequence = []
for idx, previous_sequence in enumerate(reversed(items)):
previous_tokens = previous_sequence[1:-1]
if previous_sequence[0] < (timestamp_begin + offset - overlap_time) and idx != 0:
break # the previous sequence is too far in the past
if len(previous_tokens) > 0:
# find the longest common sequence between the overlapping parts
index_left, index_right, match_length = _fast_find_longest_common_sequence(
sequence[1:relevant_timestamp], previous_tokens
)
# don't do anything if only 1 token was matched
if match_length > 1 and match_length > best_match:
best_match = match_length
best_idx = idx
end_of_curr_sequence_idx = (
np.where(sequence[index_left + 1 :] >= timestamp_begin)[0][0] + 1
)
end_of_curr_sequence_idx = end_of_curr_sequence_idx + 1 + index_left
# if all the tokens are matched, suffix
if index_left == 0 and match_length == len(previous_tokens):
sliced_sequence = np.insert(
sequence[index_left + 1 : end_of_curr_sequence_idx], 0, previous_sequence[0]
)
sliced_sequence[-1] = previous_sequence[-1]
# if part of the previous sequence is not taken
elif index_left >= 0:
sliced_sequence = sequence[index_left + 1 : end_of_curr_sequence_idx]
# let's insert the missing part of the previous sequence
previous_slice = (
previous_sequence[: index_right + 1] if index_right > 0 else [previous_sequence[0]]
)
sliced_sequence = np.insert(sliced_sequence, 0, previous_slice)
sliced_sequence[-1] += offset
if len(sliced_sequence) > 0:
items[len(items) - best_idx - 1] = sliced_sequence
items = items[: len(items) - best_idx]
sequence = sequence[end_of_curr_sequence_idx:]
# sequence might have changed
timestamp_tokens = sequence >= timestamp_begin
consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
if sum(timestamp_tokens) > 0:
last_timestamp = np.where(timestamp_tokens)[0][-1]
consecutive = (
np.append(consecutive, last_timestamp + 1) if last_timestamp not in consecutive else consecutive
)
if len(consecutive) > 0:
last_slice = 0
for current_slice in consecutive:
actual_offset = items[-1][-1] if seq_idx != 0 or last_slice != 0 else sequence[0]
sliced_tokens = sequence[last_slice:current_slice]
duration = sliced_tokens[-1] - sliced_tokens[0]
sliced_tokens[0] = actual_offset
sliced_tokens[-1] = actual_offset + duration
items.append(sliced_tokens)
last_slice = current_slice
time += chunk_len
result = []
for i in range(len(items)):
result += items[i].tolist()
return result

View File

@@ -0,0 +1,590 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
from transformers.models.whisper.tokenization_whisper import _combine_tokens_into_words, _find_longest_common_sequence
from transformers.testing_utils import require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
ES_CODE = 50262
EN_CODE = 50259
END_OF_TRANSCRIPT = 50257
START_OF_TRANSCRIPT = 50258
TRANSLATE = 50358
TRANSCRIBE = 50359
NOTIMESTAMPS = 50363
class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "openai/whisper-tiny"
tokenizer_class = WhisperTokenizer
rust_tokenizer_class = WhisperTokenizerFast
test_rust_tokenizer = True
test_sentencepiece = False
test_seq2seq = False
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
tokenizer.pad_token_id = 50256
tokenizer.pad_token = "<|endoftext|>"
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
token = "Where"
token_id = 14436
self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
self.assertEqual(vocab_keys[0], "!")
self.assertEqual(vocab_keys[1], '"')
self.assertEqual(vocab_keys[-1], "<|30.00|>")
self.assertEqual(len(vocab_keys), 51865)
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 50258)
def test_full_tokenizer(self):
tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
tokens = tokenizer.tokenize("This is a test")
self.assertListEqual(tokens, ["This", "Ġis", "Ġa", "Ġtest"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens),
[5723, 307, 257, 1500],
)
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
self.assertListEqual(
tokens,
["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."], # fmt: skip
)
ids = tokenizer.convert_tokens_to_ids(tokens)
self.assertListEqual(ids, [40, 390, 4232, 294, 1722, 25743, 11, 293, 341, 307, 16720, 526, 13])
back_tokens = tokenizer.convert_ids_to_tokens(ids)
self.assertListEqual(
back_tokens,
["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."], # fmt: skip
)
@unittest.skip
def test_tokenizer_slow_store_full_signature(self):
pass
@unittest.skip
def test_tokenizer_fast_store_full_signature(self):
pass
@unittest.skip
def test_special_tokens_initialization(self):
# Whisper relies on specific additional special tokens, so we skip this
# general test. In particular, this test loads fast tokenizer from slow
# tokenizer, and the conversion uses prefix_tokens, where we reference
# additional special tokens by specific indices, hence overriding the
# list with less tokens leads to out of index error
pass
@slow
def test_tokenizer_integration(self):
expected_encoding = {'input_ids': [[50257, 50362, 41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13, 50256], [50257, 50362, 13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13, 50256], [50257, 50362, 464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 50256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, model_name="openai/whisper-tiny.en", padding=False
)
def test_output_offsets(self):
tokenizer = self.get_tokenizer()
previous_sequence = [51492, 406, 3163, 1953, 466, 13, 51612, 51612]
self.assertEqual(
tokenizer.decode(previous_sequence, output_offsets=True),
{
"text": " not worth thinking about.",
"offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}],
},
)
# Merge when the previous sequence is a suffix of the next sequence
next_sequences_1 = [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] # fmt: skip
self.assertEqual(
tokenizer.decode(next_sequences_1, output_offsets=True),
{
"text": (
" of spectators, retrievality is not worth thinking about. His instant panic was followed by a"
" small, sharp blow high on his chest.<|endoftext|>"
),
"offsets": [
{"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)},
{
"text": " His instant panic was followed by a small, sharp blow high on his chest.",
"timestamp": (5.0, 9.4),
},
],
},
)
def test_find_longest_common_subsequence(self):
previous_sequence = [1, 2, 3]
next_sequence = [2, 3, 4, 5]
merge = _find_longest_common_sequence([previous_sequence, next_sequence])
self.assertEqual(merge, [1, 2, 3, 4, 5])
# Now previous is larger than next.
# We merge what we can and remove the extra right side of the left sequence
previous_sequence = [1, 2, 3, 4, 5, 6, 7]
next_sequence = [2, 3, 4, 5]
merge = _find_longest_common_sequence([previous_sequence, next_sequence])
self.assertEqual(merge, [1, 2, 3, 4, 5])
# Nothing in common
previous_sequence = [1, 2, 3]
next_sequence = [4, 5, 6]
merge = _find_longest_common_sequence([previous_sequence, next_sequence])
self.assertEqual(merge, [1, 2, 3, 4, 5, 6])
# Some errors in the overlap.
# We take from previous on the left, from the next on the right of the overlap
previous_sequence = [1, 2, 3, 4, 99]
next_sequence = [2, 98, 4, 5, 6]
merge = _find_longest_common_sequence([previous_sequence, next_sequence])
self.assertEqual(merge, [1, 2, 3, 4, 5, 6])
# We take from previous on the left, from the next on the right of the overlap
previous_sequence = [1, 2, 99, 4, 5]
next_sequence = [2, 3, 4, 98, 6]
merge = _find_longest_common_sequence([previous_sequence, next_sequence])
self.assertEqual(merge, [1, 2, 99, 4, 98, 6])
# This works on 3 sequences
seq1 = [1, 2, 3]
seq2 = [2, 3, 4]
seq3 = [3, 4, 5]
merge = _find_longest_common_sequence([seq1, seq2, seq3])
self.assertEqual(merge, [1, 2, 3, 4, 5])
# This works on 3 sequences with errors
seq1 = [1, 2, 3, 98, 5]
seq2 = [2, 99, 4, 5, 6, 7]
seq3 = [4, 97, 6, 7, 8]
merge = _find_longest_common_sequence([seq1, seq2, seq3])
self.assertEqual(merge, [1, 2, 3, 4, 5, 6, 7, 8])
def test_skip_special_tokens_skips_prompt_ids(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
# fmt: off
encoded_input = [
50361, 2221, 13, 2326, 388, 391, 50258, 50259, 50359,
50363, 1282, 264, 2674, 9156, 295, 1523, 11, 2221, 13,
2326, 388, 391, 13657, 365, 2681, 21296, 17711, 13, 50257,
]
# fmt: on
expected_with_special_tokens = "<|startofprev|> Mr. Quilter<|startoftranscript|><|en|><|transcribe|><|notimestamps|> On the general principles of art, Mr. Quilter writes with equal lucidity.<|endoftext|>"
expected_without_special_tokens = " On the general principles of art, Mr. Quilter writes with equal lucidity."
self.assertEqual(tokenizer.decode(encoded_input, skip_special_tokens=False), expected_with_special_tokens)
self.assertEqual(tokenizer.decode(encoded_input, skip_special_tokens=True), expected_without_special_tokens)
self.assertEqual(rust_tokenizer.decode(encoded_input, skip_special_tokens=False), expected_with_special_tokens)
self.assertEqual(
rust_tokenizer.decode(encoded_input, skip_special_tokens=True), expected_without_special_tokens
)
def test_skip_special_tokens_with_timestamps(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
# fmt: off
encoded_input = [
50258, 50363, 50364, 634, 575, 12525, 22618, 1968, 6144,
35617, 20084, 1756, 311, 589, 307, 534, 10281, 934,
439, 293, 50676, 50676, 393, 4411, 294, 309, 457,
707, 295, 33301, 286, 392, 6628, 13, 50836, 50257,
]
# fmt: on
expected_with_special_tokens = "<|startoftranscript|><|notimestamps|><|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|><|endoftext|>"
expected_without_special_tokens = "<|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|>"
self.assertEqual(
tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=False),
expected_with_special_tokens,
)
self.assertEqual(
tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=True),
expected_without_special_tokens,
)
self.assertEqual(
rust_tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=False),
expected_with_special_tokens,
)
self.assertEqual(
rust_tokenizer.decode(encoded_input, decode_with_timestamps=True, skip_special_tokens=True),
expected_without_special_tokens,
)
def test_fast_tokenizer_get_prompt_ids(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
prompt = "This is test prompt text."
tokenizer_prompt_ids = tokenizer.get_prompt_ids(prompt)
fast_tokenizer_prompt_ids = rust_tokenizer.get_prompt_ids(prompt)
self.assertListEqual(tokenizer_prompt_ids.tolist(), fast_tokenizer_prompt_ids.tolist())
def test_tokenizer_decode_prompt(self):
prompt_text = "What does the fox say?"
input_text = "Hatee hatee hatee ho"
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
# encode prompt and input text using tokenizer
prompt_ids = tokenizer.get_prompt_ids(prompt_text, return_tensors="np")
input_ids = tokenizer(input_text, return_tensors="np").input_ids[0]
input_ids = np.hstack([prompt_ids, input_ids])
# encode using fast tokenizer
rust_prompt_ids = rust_tokenizer.get_prompt_ids(prompt_text, return_tensors="np")
rust_input_ids = rust_tokenizer(input_text, return_tensors="np").input_ids[0]
rust_input_ids = np.hstack([rust_prompt_ids, rust_input_ids])
# check with prompt in output
pred_text = tokenizer.decode(input_ids, skip_special_tokens=False)
rust_pred_text = rust_tokenizer.decode(rust_input_ids, skip_special_tokens=False)
# check correctness for both tokenizers
expected_text = f"<|startofprev|> {prompt_text}<|startoftranscript|><|notimestamps|>{input_text}<|endoftext|>"
self.assertEqual(pred_text.strip(), expected_text)
self.assertEqual(rust_pred_text.strip(), expected_text)
# check stripping prompt from output
pred_text = tokenizer.decode(input_ids, skip_special_tokens=True)
rust_pred_text = tokenizer.decode(input_ids, skip_special_tokens=True)
self.assertEqual(pred_text.strip(), input_text)
self.assertEqual(rust_pred_text.strip(), input_text)
def test_combine_tokens_into_words(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
# 'whatever "whatever" said someone, clever!?'
encoded_input = [1363, 7969, 503, 1363, 7969, 1, 848, 1580, 11, 13494, 7323]
expected_words = ["whatever", ' "whatever"', " said", " someone,", " clever!?"]
expected_tokens = [[1363, 7969], [503, 1363, 7969, 1], [848], [1580, 11], [13494, 7323]]
expected_indices = [[0, 1], [2, 3, 4, 5], [6], [7, 8], [9, 10]]
output = _combine_tokens_into_words(tokenizer, encoded_input)
self.assertEqual(expected_words, output[0])
self.assertEqual(expected_tokens, output[1])
self.assertEqual(expected_indices, output[2])
output_rust = _combine_tokens_into_words(rust_tokenizer, encoded_input)
self.assertEqual(expected_words, output_rust[0])
self.assertEqual(expected_tokens, output_rust[1])
self.assertEqual(expected_indices, output_rust[2])
def test_basic_normalizer(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
input_str = "Hola güey!"
expected_output_normalize = "hola güey "
expected_output_diacritics = "hola guey "
# tokenizer tests
encoded_input = tokenizer(input_str).input_ids
decoded_output = tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
self.assertEqual(decoded_output, input_str)
decoded_output_normalize = tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
self.assertEqual(decoded_output_normalize, expected_output_normalize)
decoded_output_diacritics = tokenizer.decode(
encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
)
self.assertEqual(decoded_output_diacritics, expected_output_diacritics)
# fast tokenizer tests
encoded_input = rust_tokenizer(input_str).input_ids
decoded_output = rust_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
self.assertEqual(decoded_output, input_str)
decoded_output_normalize = rust_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
self.assertEqual(decoded_output_normalize, expected_output_normalize)
decoded_output_diacritics = rust_tokenizer.decode(
encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
)
self.assertEqual(decoded_output_diacritics, expected_output_diacritics)
def test_decode_asr_with_word_level_timestamps(self):
# fmt: off
model_outputs = [
{
'stride': [10, 0, 5],
'tokens': np.array([[50363, 3363, 11, 345, 460, 0, 50423]]),
'token_timestamps': np.array([[0.0, 0.5, 0.52, 0.78, 1.2, 1.28, 1.28]])
}
]
# fmt: on
tokenizer = WhisperTokenizer.from_pretrained("onnx-community/whisper-tiny.en_timestamped")
result = tokenizer._decode_asr(
model_outputs, return_timestamps="word", return_language=False, time_precision=0.02
)
EXPECTED_OUTPUT = (
" Yes, you can!",
{
"chunks": [
{"text": " Yes,", "timestamp": (0.0, 0.52)},
{"text": " you", "timestamp": (0.52, 0.78)},
{"text": " can!", "timestamp": (0.78, 1.28)},
]
},
)
self.assertEqual(result, EXPECTED_OUTPUT)
class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
checkpoint_name = "openai/whisper-small.en"
@classmethod
def setUpClass(cls):
cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
return cls
def test_tokenizer_equivalence(self):
text = "다람쥐 헌 쳇바퀴에 타고파"
multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="korean")
monolingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
monolingual_tokens = monolingual_tokenizer.encode(text, add_special_tokens=False)
multilingual_tokens = multilingual_tokenizer.encode(text, add_special_tokens=False)
assert monolingual_tokenizer.decode(monolingual_tokens) == text
assert multilingual_tokenizer.decode(multilingual_tokens) == text
assert len(monolingual_tokens) > len(multilingual_tokens)
# fmt: off
EXPECTED_ENG = [
46695, 97, 167, 252, 234, 168, 98, 238, 220, 169,
245, 234, 23821, 111, 229, 167, 108, 242, 169, 222,
112, 168, 245, 238, 220, 169, 225, 222, 166, 111,
254, 169, 234, 234
]
EXPECTED_MULTI = [
9835, 22855, 168, 98, 238, 13431, 234, 43517, 229, 47053,
169, 222, 19086, 19840, 1313, 17974
]
# fmt: on
self.assertListEqual(monolingual_tokens, EXPECTED_ENG)
self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
def test_tokenizer_special(self):
multilingual_tokenizer = WhisperTokenizer.from_pretrained(
"openai/whisper-tiny", language="english", task="transcribe"
)
text = "Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
multilingual_tokens = multilingual_tokenizer.encode(text)
# fmt: off
# format: <|startoftranscript|> <|lang-id|> <|task|> <|notimestamps|> ... transcription ids ... <|endoftext|>
EXPECTED_MULTI = [
START_OF_TRANSCRIPT, EN_CODE, TRANSCRIBE, NOTIMESTAMPS, 7057, 0, 1012, 366, 291,
2633, 30, 508, 6, 1301, 287, 6, 36107, 631, 220, 11178,
115, 15567, 871, 44393, END_OF_TRANSCRIPT
]
EXPECTED_SPECIAL_TEXT = (
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>Hey! How are you feeling? "
"J'ai l'impression que 郷さん est prêt<|endoftext|>"
)
# fmt: on
self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
special_transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=False)
self.assertEqual(special_transcript, EXPECTED_SPECIAL_TEXT)
transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=True)
self.assertEqual(transcript, text)
def test_vocab_size(self):
self.assertEqual(self.tokenizer.vocab_size, 50257)
# Copied from tests.models.speech_to_text.test_tokenization_speech_to_text.SpeechToTextTokenizerMultilinguialTest.test_tokenizer_decode_ignores_language_codes
def test_tokenizer_decode_ignores_language_codes(self):
self.assertIn(ES_CODE, self.tokenizer.all_special_ids)
generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2]
result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
self.assertEqual(result, expected_spanish)
self.assertNotIn(self.tokenizer.eos_token, result)
def test_batch_encoding(self):
multilingual_tokenizer = WhisperTokenizer.from_pretrained(
"openai/whisper-tiny", language="spanish", task="translate"
)
batch = ["El gato ", "El gato se sentó"]
batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
# fmt: off
EXPECTED_MULTI = [
[START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 220,
END_OF_TRANSCRIPT, END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
[START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 369,
2279, 812, END_OF_TRANSCRIPT]
]
# fmt: on
self.assertListEqual(batch_output, EXPECTED_MULTI)
def test_set_prefix_tokens(self):
multilingual_tokenizer = WhisperTokenizer.from_pretrained(
"openai/whisper-tiny", language="spanish", task="translate"
)
# change the language prefix token from Spanish to English
multilingual_tokenizer.set_prefix_tokens(language="english")
batch = ["the cat", "the cat sat"]
batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
# fmt: off
EXPECTED_MULTI = [
[START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
[START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
3227, END_OF_TRANSCRIPT]
]
# fmt: on
self.assertListEqual(batch_output, EXPECTED_MULTI)
def test_batch_encoding_decoding(self):
multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
batch = ["hola güey", "que onda"]
batch_encoding = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
transcription = multilingual_tokenizer.batch_decode(batch_encoding, skip_special_tokens=True)
self.assertListEqual(batch, transcription)
def test_offset_decoding(self):
multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
# fmt: off
INPUT_TOKENS = [
50258, 50259, 50359, 50364, 441, 1857, 4174, 11, 5242, 366,
257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
293, 25730, 311, 454, 34152, 4496, 904, 50724, 50724, 366,
382, 4048, 382, 257, 361, 18459, 13065, 13, 2221, 13,
7145, 74, 325, 38756, 311, 29822, 7563, 412, 472, 709,
294, 264, 51122, 51122, 912, 636, 300, 2221, 13, 2741,
5767, 1143, 281, 7319, 702, 7798, 13, 400, 2221, 13,
2619, 4004, 811, 2709, 702, 51449, 51449, 50257
]
# fmt: on
output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
self.assertEqual(
output,
[
{
"text": (
" Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles"
),
"timestamp": (0.0, 7.2),
},
{
"text": (
" are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the"
),
"timestamp": (7.2, 15.16),
},
{
"text": " same way that Mr. Carker used to flash his teeth. And Mr. John Colier gives his",
"timestamp": (15.16, 21.7),
},
],
)
# test `decode_with_offsets`
output = multilingual_tokenizer.decode(INPUT_TOKENS, decode_with_timestamps=True)
self.assertEqual(
output,
"<|startoftranscript|><|en|><|transcribe|><|0.00|> Lennils, pictures are a sort of upguards and atom"
" paintings, and Mason's exquisite idles<|7.20|><|7.20|> are as national as a jingo poem. Mr. Birkut"
" Foster's landscapes smile at one much in the<|15.16|><|15.16|> same way that Mr. Carker used to flash"
" his teeth. And Mr. John Colier gives his<|21.70|><|21.70|><|endoftext|>",
)
# test a single sequence with timestamps
# fmt: off
INPUT_TOKENS = [
50364, 441, 1857, 4174, 11, 5242, 366,
257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
293, 25730, 311, 454, 34152, 4496, 904, 50724
]
# fmt: on
output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
self.assertEqual(
output[0],
{
"text": " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles",
"timestamp": (0.0, 7.2),
},
)
# test a sequence without a single timestamps
# fmt: off
INPUT_TOKENS = [
441, 1857, 4174, 11, 5242, 366,
257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
293, 25730, 311, 454, 34152, 4496, 904, 50724
]
# fmt: on
output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
self.assertEqual(output, [])
def test_convert_to_list_np(self):
test_list = [[1, 2, 3], [4, 5, 6]]
# Test with an already converted list
self.assertListEqual(WhisperTokenizer._convert_to_list(test_list), test_list)
self.assertListEqual(WhisperTokenizerFast._convert_to_list(test_list), test_list)
# Test with a numpy array
np_array = np.array(test_list)
self.assertListEqual(WhisperTokenizer._convert_to_list(np_array), test_list)
self.assertListEqual(WhisperTokenizerFast._convert_to_list(np_array), test_list)
@require_torch
def test_convert_to_list_pt(self):
import torch
test_list = [[1, 2, 3], [4, 5, 6]]
torch_tensor = torch.tensor(test_list)
self.assertListEqual(WhisperTokenizer._convert_to_list(torch_tensor), test_list)
self.assertListEqual(WhisperTokenizerFast._convert_to_list(torch_tensor), test_list)