init

2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions
--- a/transformers/tests/models/whisper/init.py
+++ b/transformers/tests/models/whisper/init.py
--- a/transformers/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/transformers/tests/models/whisper/test_feature_extraction_whisper.py
@@ -0,0 +1,376 @@
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import itertools
+import os
+import random
+import tempfile
+import unittest
+
+import numpy as np
+from datasets import load_dataset
+
+from transformers import WhisperFeatureExtractor
+from transformers.testing_utils import (
+    check_json_file_has_correct_format,
+    require_torch,
+    require_torch_accelerator,
+)
+from transformers.utils.import_utils import is_torch_available
+
+from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
+
+
+# torch is an optional dependency: only import it when `is_torch_available()` reports it is installed.
+if is_torch_available():
+    import torch
+
+# Unseeded module-level generator; `floats_list` falls back to it when no `rng` is passed.
+global_rng = random.Random()
+
+
+def floats_list(shape, scale=1.0, rng=None, name=None):
+    """Build a nested list of random floats.
+
+    Args:
+        shape: Indexable whose first two entries are the number of rows and the number of columns.
+        scale: Multiplier applied to every value drawn from `rng.random()`.
+        rng: Object exposing a `random()` method. Defaults to the module-level `global_rng`.
+        name: Unused. Kept so the signature stays backward-compatible.
+
+    Returns:
+        A list of `shape[0]` lists, each holding `shape[1]` Python floats (not a tensor).
+    """
+    if rng is None:
+        rng = global_rng
+
+    # Values are drawn row by row, so a seeded `rng` always produces the same layout.
+    return [[rng.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
+
+
+class WhisperFeatureExtractionTester:
+    """Holds the hyper-parameters used to build feature extractors and dummy inputs for the tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        min_seq_length=400,
+        max_seq_length=2000,
+        feature_size=10,
+        hop_length=160,
+        chunk_length=8,
+        padding_value=0.0,
+        sampling_rate=4_000,
+        return_attention_mask=False,
+        do_normalize=True,
+    ):
+        """Store the configuration values.
+
+        `parent` is kept as-is on the instance; the remaining arguments are either forwarded through
+        `prepare_feat_extract_dict` or used to size the inputs built by `prepare_inputs_for_common`.
+        """
+        self.parent = parent
+        self.batch_size = batch_size
+        self.min_seq_length = min_seq_length
+        self.max_seq_length = max_seq_length
+        # Step between two consecutive input lengths. The divisor is clamped to 1 so that
+        # `batch_size=1` yields a single input instead of raising ZeroDivisionError.
+        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // max(self.batch_size - 1, 1)
+        self.padding_value = padding_value
+        self.sampling_rate = sampling_rate
+        self.return_attention_mask = return_attention_mask
+        self.do_normalize = do_normalize
+        self.feature_size = feature_size
+        self.chunk_length = chunk_length
+        self.hop_length = hop_length
+
+    def prepare_feat_extract_dict(self):
+        """Return the keyword arguments used to instantiate the feature extractor under test."""
+        return {
+            "feature_size": self.feature_size,
+            "hop_length": self.hop_length,
+            "chunk_length": self.chunk_length,
+            "padding_value": self.padding_value,
+            "sampling_rate": self.sampling_rate,
+            "return_attention_mask": self.return_attention_mask,
+            "do_normalize": self.do_normalize,
+        }
+
+    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
+        """Build a batch of random inputs with `floats_list`.
+
+        Args:
+            equal_length: When true, create `batch_size` inputs that all have `max_seq_length` rows.
+                Otherwise the number of rows grows from `min_seq_length` (inclusive) towards
+                `max_seq_length` (exclusive) in steps of `seq_length_diff`.
+            numpify: When true, convert every input to a numpy array with `np.asarray`.
+
+        Returns:
+            A list of inputs, each with `feature_size` columns.
+        """
+
+        # NOTE(review): `_flatten` is not called anywhere in this method; it is a candidate for removal
+        # together with the module-level `itertools` import.
+        def _flatten(list_of_lists):
+            return list(itertools.chain(*list_of_lists))
+
+        if equal_length:
+            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
+        else:
+            # make sure that inputs increase in size
+            speech_inputs = [
+                floats_list((x, self.feature_size))
+                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
+            ]
+        if numpify:
+            speech_inputs = [np.asarray(x) for x in speech_inputs]
+        return speech_inputs
+
+
+class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
+    """Unit and integration tests for `WhisperFeatureExtractor`."""
+
+    feature_extraction_class = WhisperFeatureExtractor
+
+    def setUp(self):
+        self.feat_extract_tester = WhisperFeatureExtractionTester(self)
+
+    # NOTE(review): `self.feat_extract_dict` is not defined in this class; it is presumably provided by
+    # `SequenceFeatureExtractionTestMixin` - confirm.
+    def test_feat_extract_from_and_save_pretrained(self):
+        """A `save_pretrained` / `from_pretrained` round trip keeps the config and the mel filters."""
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
+            check_json_file_has_correct_format(saved_file)
+            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        mel_1 = feat_extract_first.mel_filters
+        mel_2 = feat_extract_second.mel_filters
+        self.assertTrue(np.allclose(mel_1, mel_2))
+        self.assertEqual(dict_first, dict_second)
+
+    def test_feat_extract_to_json_file(self):
+        """A `to_json_file` / `from_json_file` round trip keeps the config and the mel filters."""
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
+            feat_extract_first.to_json_file(json_file_path)
+            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        mel_1 = feat_extract_first.mel_filters
+        mel_2 = feat_extract_second.mel_filters
+        self.assertTrue(np.allclose(mel_1, mel_2))
+        self.assertEqual(dict_first, dict_second)
+
+    def test_feat_extract_from_pretrained_kwargs(self):
+        """A `feature_size` override passed to `from_pretrained` changes the mel filter bank size."""
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
+            check_json_file_has_correct_format(saved_file)
+            feat_extract_second = self.feature_extraction_class.from_pretrained(
+                tmpdirname, feature_size=2 * self.feat_extract_dict["feature_size"]
+            )
+
+        mel_1 = feat_extract_first.mel_filters
+        mel_2 = feat_extract_second.mel_filters
+        self.assertEqual(2 * mel_1.shape[1], mel_2.shape[1])
+
+    def test_call(self):
+        """List and numpy inputs, batched or not, must produce matching features."""
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        # create three inputs of length 800, 1000, and 1200
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        # Test feature size
+        input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
+        self.assertEqual(input_features.ndim, 3)
+        self.assertEqual(input_features.shape[-1], feature_extractor.nb_max_frames)
+        self.assertEqual(input_features.shape[-2], feature_extractor.feature_size)
+
+        # Test not batched input
+        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
+        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
+
+        # Test batched
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+        # Test 2-D numpy arrays are batched.
+        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
+        np_speech_inputs = np.asarray(speech_inputs)
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+        # Test truncation required: inputs cut to `n_samples` by hand must give the same features
+        speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs]
+        np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
+
+        encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+    def test_dither(self):
+        """Features with and without a little dithering are similar, but not the same."""
+        np.random.seed(42)  # seed the dithering randn()
+
+        dict_no_dither = self.feat_extract_tester.prepare_feat_extract_dict()
+        dict_no_dither["dither"] = 0.0
+
+        dict_dither = self.feat_extract_tester.prepare_feat_extract_dict()
+        dict_dither["dither"] = 0.00003  # approx. 1/32k
+
+        feature_extractor_no_dither = self.feature_extraction_class(**dict_no_dither)
+        feature_extractor_dither = self.feature_extraction_class(**dict_dither)
+
+        # create three inputs of length 800, 1000, and 1200
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        # compute features
+        input_features_no_dither = feature_extractor_no_dither(
+            np_speech_inputs, padding=True, return_tensors="np", sampling_rate=dict_no_dither["sampling_rate"]
+        ).input_features
+        input_features_dither = feature_extractor_dither(
+            np_speech_inputs, padding=True, return_tensors="np", sampling_rate=dict_dither["sampling_rate"]
+        ).input_features
+
+        # test there is a difference between features (there's added noise to input signal)
+        abs_diff = np.abs(input_features_dither - input_features_no_dither)
+
+        # features are not identical
+        self.assertGreater(abs_diff.mean(), 1e-6)
+        # features are not too different
+        self.assertLessEqual(abs_diff.mean(), 1e-4)
+        self.assertLessEqual(abs_diff.max(), 5e-3)
+
+    def test_feature_shape(self):
+        """The attention mask length follows the number of input samples around a hop boundary."""
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        hop_length = feature_extractor.hop_length
+        test_inputs = np.random.randn(16000)
+
+        # (number of input samples, expected size of the last attention mask dimension)
+        cases = (
+            (hop_length * 5 + 1, 5),
+            (hop_length * 5, 5),
+            (hop_length * 5 - 1, 4),
+        )
+        for num_samples, expected_length in cases:
+            with self.subTest(num_samples=num_samples):
+                attention_mask = feature_extractor(
+                    [test_inputs[:num_samples]],
+                    return_attention_mask=True,
+                    padding=False,
+                    return_tensors="np",
+                ).attention_mask
+                self.assertEqual(attention_mask.shape[-1], expected_length)
+
+    @require_torch
+    def test_double_precision_pad(self):
+        """`pad` returns float32 features for float64 input, for both numpy and torch outputs."""
+        import torch
+
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
+        py_speech_inputs = np_speech_inputs.tolist()
+
+        for inputs in [py_speech_inputs, np_speech_inputs]:
+            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
+            self.assertEqual(np_processed.input_features.dtype, np.float32)
+            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
+            self.assertEqual(pt_processed.input_features.dtype, torch.float32)
+
+    def _load_datasamples(self, num_samples):
+        """Return the audio arrays of the first `num_samples` entries (sorted by id) of the dummy dataset."""
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]
+
+    @require_torch_accelerator
+    @require_torch
+    def test_torch_integration(self):
+        """The first 30 values of the first feature row match reference values for one real sample."""
+        # fmt: off
+        EXPECTED_INPUT_FEATURES = torch.tensor(
+            [
+                0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
+                0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
+                0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
+                -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
+            ]
+        )
+        # fmt: on
+
+        input_speech = self._load_datasamples(1)
+        feature_extractor = WhisperFeatureExtractor()
+        input_features = feature_extractor(input_speech, return_tensors="pt").input_features
+
+        self.assertEqual(input_features.shape, (1, 80, 3000))
+        torch.testing.assert_close(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4)
+
+    # NOTE(review): this module only does `import unittest`; `unittest.mock` resolves here only if another
+    # import has already loaded that submodule - consider importing it explicitly.
+    @unittest.mock.patch("transformers.models.whisper.feature_extraction_whisper.is_torch_available", lambda: False)
+    def test_numpy_integration(self):
+        """Same reference check as the torch integration test, with torch reported as unavailable."""
+        # fmt: off
+        EXPECTED_INPUT_FEATURES = np.array(
+            [
+                0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
+                0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
+                0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
+                -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
+            ]
+        )
+        # fmt: on
+
+        input_speech = self._load_datasamples(1)
+        feature_extractor = WhisperFeatureExtractor()
+        input_features = feature_extractor(input_speech, return_tensors="np").input_features
+        self.assertEqual(input_features.shape, (1, 80, 3000))
+        self.assertTrue(np.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4))
+
+    def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
+        """Normalizing a large-amplitude signal yields (approximately) zero mean and unit variance."""
+        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        audio = self._load_datasamples(1)[0]
+        audio = ((audio - audio.min()) / (audio.max() - audio.min())) * 65535  # Rescale to [0, 65535] to show issue
+        audio = feat_extract.zero_mean_unit_var_norm([audio], attention_mask=None)[0]
+
+        # Compare the magnitude of the mean: without `abs`, any negative mean would pass the check.
+        self.assertLess(np.abs(np.mean(audio)), 1e-3)
+        self.assertLess(np.abs(np.var(audio) - 1), 1e-3)
+
+    @require_torch_accelerator
+    @require_torch
+    def test_torch_integration_batch(self):
+        """Batched variant of the torch integration test, run inside a CUDA device context."""
+        # fmt: off
+        EXPECTED_INPUT_FEATURES = torch.tensor(
+            [
+                [
+                    0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
+                    0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
+                    0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
+                    -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
+                ],
+                [
+                    -0.4696, -0.0751, 0.0276, -0.0312, -0.0540, -0.0383, 0.1295, 0.0568,
+                    -0.2071, -0.0548, 0.0389, -0.0316, -0.2346, -0.1068, -0.0322, 0.0475,
+                    -0.1709, -0.0041, 0.0872, 0.0537, 0.0075, -0.0392, 0.0371, 0.0189,
+                    -0.1522, -0.0270, 0.0744, 0.0738, -0.0245, -0.0667
+                ],
+                [
+                    -0.2337, -0.0060, -0.0063, -0.2353, -0.0431, 0.1102, -0.1492, -0.0292,
+                    0.0787, -0.0608, 0.0143, 0.0582, 0.0072, 0.0101, -0.0444, -0.1701,
+                    -0.0064, -0.0027, -0.0826, -0.0730, -0.0099, -0.0762, -0.0170, 0.0446,
+                    -0.1153, 0.0960, -0.0361, 0.0652, 0.1207, 0.0277
+                ]
+            ]
+        )
+        # fmt: on
+
+        # NOTE(review): the device is hard-coded to "cuda" although the test is gated by
+        # `require_torch_accelerator`; confirm whether non-CUDA accelerators are meant to run it.
+        with torch.device("cuda"):
+            input_speech = self._load_datasamples(3)
+            feature_extractor = WhisperFeatureExtractor()
+            input_features = feature_extractor(input_speech, return_tensors="pt").input_features
+        self.assertEqual(input_features.shape, (3, 80, 3000))
+        torch.testing.assert_close(input_features[:, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4)
--- a/transformers/tests/models/whisper/test_modeling_whisper.py
+++ b/transformers/tests/models/whisper/test_modeling_whisper.py
--- a/transformers/tests/models/whisper/test_processing_whisper.py
+++ b/transformers/tests/models/whisper/test_processing_whisper.py
@@ -0,0 +1,461 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+
+from transformers import WhisperTokenizer, WhisperTokenizerFast, is_speech_available
+from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
+
+from .test_feature_extraction_whisper import floats_list
+
+
+# `WhisperFeatureExtractor` and `WhisperProcessor` are only imported when `is_speech_available()` is true.
+if is_speech_available():
+    from transformers import WhisperFeatureExtractor, WhisperProcessor
+
+
+# Token ids that `test_get_decoder_prompt_ids` expects as the last element of each item returned by
+# `get_decoder_prompt_ids(task="transcribe", no_timestamps=True)`.
+# NOTE(review): presumably the ids of `<|transcribe|>` and `<|notimestamps|>` in the checkpoint vocabulary - confirm.
+TRANSCRIBE = 50358
+NOTIMESTAMPS = 50362
+
+
+@require_torch
+@require_torchaudio
+@require_sentencepiece
+class WhisperProcessorTest(unittest.TestCase):
+    """Checks that `WhisperProcessor` agrees with the tokenizer and feature extractor it wraps."""
+
+    def setUp(self):
+        self.checkpoint = "openai/whisper-small.en"
+        # Scratch directory for the save / load round trips; removed again in `tearDown`.
+        self.tmpdirname = tempfile.mkdtemp()
+
+    def get_tokenizer(self, **kwargs):
+        """Load a `WhisperTokenizer` from the test checkpoint, forwarding `kwargs`."""
+        tokenizer = WhisperTokenizer.from_pretrained(self.checkpoint, **kwargs)
+        return tokenizer
+
+    def get_feature_extractor(self, **kwargs):
+        """Load a `WhisperFeatureExtractor` from the test checkpoint, forwarding `kwargs`."""
+        feature_extractor = WhisperFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
+        return feature_extractor
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    def test_save_load_pretrained_default(self):
+        """A saved and reloaded processor keeps the vocabulary and the feature extractor config."""
+        original_tokenizer = self.get_tokenizer()
+        original_extractor = self.get_feature_extractor()
+
+        processor = WhisperProcessor(tokenizer=original_tokenizer, feature_extractor=original_extractor)
+        processor.save_pretrained(self.tmpdirname)
+        reloaded = WhisperProcessor.from_pretrained(self.tmpdirname)
+
+        self.assertEqual(reloaded.tokenizer.get_vocab(), original_tokenizer.get_vocab())
+        self.assertIsInstance(reloaded.tokenizer, WhisperTokenizerFast)
+
+        self.assertEqual(reloaded.feature_extractor.to_json_string(), original_extractor.to_json_string())
+        self.assertIsInstance(reloaded.feature_extractor, WhisperFeatureExtractor)
+
+    def test_save_load_pretrained_additional_features(self):
+        """Extra kwargs given to `from_pretrained` reach both the tokenizer and the feature extractor."""
+        saved = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        saved.save_pretrained(self.tmpdirname)
+
+        # Reference components built directly with the same overrides.
+        reference_tokenizer = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
+        reference_extractor = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
+
+        reloaded = WhisperProcessor.from_pretrained(
+            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
+        )
+
+        self.assertEqual(reloaded.tokenizer.get_vocab(), reference_tokenizer.get_vocab())
+        self.assertIsInstance(reloaded.tokenizer, WhisperTokenizerFast)
+
+        self.assertEqual(reloaded.feature_extractor.to_json_string(), reference_extractor.to_json_string())
+        self.assertIsInstance(reloaded.feature_extractor, WhisperFeatureExtractor)
+
+    def test_feature_extractor(self):
+        """Calling the processor on audio gives the same sums as the feature extractor alone."""
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        raw_speech = floats_list((3, 1000))
+
+        from_extractor = feature_extractor(raw_speech, return_tensors="np")
+        from_processor = processor(raw_speech, return_tensors="np")
+
+        for key in from_extractor:
+            expected_sum = from_extractor[key].sum()
+            actual_sum = from_processor[key].sum()
+            self.assertAlmostEqual(expected_sum, actual_sum, delta=1e-2)
+
+    def test_tokenizer(self):
+        """Calling the processor with `text=` gives the same encoding as the tokenizer alone."""
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        input_str = "This is a test string"
+
+        from_processor = processor(text=input_str)
+        from_tokenizer = tokenizer(input_str)
+
+        for key in from_tokenizer:
+            self.assertListEqual(from_tokenizer[key], from_processor[key])
+
+    def test_tokenizer_decode(self):
+        """`processor.batch_decode` gives the same output as `tokenizer.batch_decode`."""
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
+
+        from_processor = processor.batch_decode(predicted_ids)
+        from_tokenizer = tokenizer.batch_decode(predicted_ids)
+
+        self.assertListEqual(from_tokenizer, from_processor)
+
+    def test_get_decoder_prompt_ids(self):
+        """`get_decoder_prompt_ids` returns a list of sequences ending in the expected token ids."""
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", no_timestamps=True)
+
+        self.assertIsInstance(forced_decoder_ids, list)
+        for entry in forced_decoder_ids:
+            self.assertIsInstance(entry, (list, tuple))
+
+        last_ids = [entry[-1] for entry in forced_decoder_ids]
+        self.assertListEqual(last_ids, [TRANSCRIBE, NOTIMESTAMPS])
+
+    def test_get_prompt_ids(self):
+        """A text prompt is encoded behind the `<|startofprev|>` token."""
+        processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        prompt_ids = processor.get_prompt_ids("Mr. Quilter")
+        decoded_prompt = processor.tokenizer.decode(prompt_ids)
+
+        self.assertListEqual(prompt_ids.tolist(), [50360, 1770, 13, 2264, 346, 353])
+        self.assertEqual(decoded_prompt, "<|startofprev|> Mr. Quilter")
+
+    def test_empty_get_prompt_ids(self):
+        """An empty prompt still yields `<|startofprev|>` followed by one more token."""
+        processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        prompt_ids = processor.get_prompt_ids("")
+        decoded_prompt = processor.tokenizer.decode(prompt_ids)
+
+        self.assertListEqual(prompt_ids.tolist(), [50360, 220])
+        self.assertEqual(decoded_prompt, "<|startofprev|> ")
+
+    def test_get_prompt_ids_with_special_tokens(self):
+        """Prompts containing special tokens raise a `ValueError` that names the first offending token."""
+        processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+
+        # (prompt, special token expected in the error message)
+        disallowed_cases = [
+            ("<|startofprev|> test", "<|startofprev|>"),
+            ("test <|notimestamps|>", "<|notimestamps|>"),
+            ("test <|zh|> test <|transcribe|>", "<|zh|>"),
+        ]
+        for prompt, special_token in disallowed_cases:
+            with pytest.raises(ValueError) as excinfo:
+                processor.get_prompt_ids(prompt)
+            expected = f"Encountered text in the prompt corresponding to disallowed special token: {special_token}."
+            self.assertEqual(expected, str(excinfo.value))
+
+    def test_find_longest_common_subsequence_old(self):
+        """Test using the old processing functions used in the ASR pipeline, but that serves as a BC reference."""
+        max_source_positions = 1500
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
+
+        previous_sequence = [[51492, 406, 3163, 1953, 466, 13, 51612, 51612]]
+
+        def merge_with_previous(next_sequences, next_chunk_stride):
+            # `next_chunk_stride` is the middle entry of the tuple attached to the second chunk; its exact
+            # meaning is defined by `_find_timestamp_sequence`.
+            chunks = [[previous_sequence, (480_000, 0, 0)], [next_sequences, (480_000, next_chunk_stride, 0)]]
+            return _find_timestamp_sequence(
+                chunks,
+                processor.tokenizer,
+                processor.feature_extractor,
+                max_source_positions,
+            )
+
+        decoded_previous = processor.decode(previous_sequence[0], output_offsets=True)
+        self.assertEqual(
+            decoded_previous,
+            {
+                "text": " not worth thinking about.",
+                "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}],
+            },
+        )
+
+        # Case 1: the previous sequence is a suffix of the first timestamped segment of the next sequence
+        # fmt: off
+        next_sequences_1 = [
+            [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]
+        ]
+        # fmt: on
+        decoded_next_1 = processor.decode(next_sequences_1[0], output_offsets=True)
+        self.assertEqual(
+            decoded_next_1,
+            {
+                "text": (
+                    " of spectators, retrievality is not worth thinking about. His instant panic was followed by a"
+                    " small, sharp blow high on his chest.<|endoftext|>"
+                ),
+                "offsets": [
+                    {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)},
+                    {
+                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
+                        "timestamp": (5.0, 9.4),
+                    },
+                ],
+            },
+        )
+        merged_1 = merge_with_previous(next_sequences_1, 120_000)
+
+        # fmt: off
+        self.assertEqual(
+            merged_1,
+            [51492, 406, 3163, 1953, 466, 13, 51739, 51739, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959],
+        )
+        # fmt: on
+        self.assertEqual(
+            processor.decode(merged_1, output_offsets=True),
+            {
+                "text": (
+                    " not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
+                    " chest."
+                ),
+                "offsets": [
+                    {"text": " not worth thinking about.", "timestamp": (22.56, 27.5)},
+                    {
+                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
+                        "timestamp": (27.5, 31.900000000000002),
+                    },
+                ],
+            },
+        )
+
+        # Case 2: the previous sequence sits in the middle of the next sequence, which holds a single segment
+        # fmt: off
+        next_sequences_2 = [
+            [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]
+        ]
+        # fmt: on
+        # {'text': ' of spectators, retrievality is not worth thinking about. His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)}
+        merged_2 = merge_with_previous(next_sequences_2, 120_000)
+        # fmt: off
+        self.assertEqual(
+            merged_2,
+            [51492, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959],
+        )
+        # fmt: on
+        self.assertEqual(
+            processor.decode(merged_2, output_offsets=True),
+            {
+                "text": (
+                    " not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
+                    " chest."
+                ),
+                "offsets": [
+                    {
+                        "text": (
+                            " not worth thinking about. His instant panic was followed by a small, sharp blow high on"
+                            " his chest."
+                        ),
+                        "timestamp": (22.56, 31.900000000000002),
+                    },
+                ],
+            },
+        )
+
+        # Case 3: the previous sequence is not included in the next sequence at all
+        next_sequences_3 = [[50364, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50584, 50257]]  # fmt: skip
+        # {'text': ' His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)}
+        merged_3 = merge_with_previous(next_sequences_3, 120_000)
+        self.assertEqual(
+            merged_3,
+            [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51832],
+        )  # fmt: skip
+        self.assertEqual(
+            processor.decode(merged_3, output_offsets=True),
+            {
+                "text": (
+                    " not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
+                    " chest."
+                ),
+                "offsets": [
+                    {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)},
+                    {
+                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
+                        "timestamp": (24.96, 29.36),
+                    },
+                ],
+            },
+        )
+
+        # Case 4: the sequence is not in the first next predicted start and end of timestamp
+        next_sequences_4 = [
+            [50364, 2812, 9836, 14783, 390, 406, 3163, 1953, 466, 13, 50634, 50634, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50934]
+        ]  # fmt: skip
+        merged_4 = merge_with_previous(next_sequences_4, 167_000)
+        self.assertEqual(
+            merged_4,
+            [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51912]
+        )  # fmt: skip
+        self.assertEqual(
+            processor.decode(merged_4, output_offsets=True),
+            {
+                "text": (
+                    " not worth thinking about. His instant panic was followed by a small, sharp blow high on his"
+                    " chest."
+                ),
+                "offsets": [
+                    {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)},
+                    {
+                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
+                        "timestamp": (24.96, 30.96),
+                    },
+                ],
+            },
+        )
+
+
+def _fast_find_longest_common_sequence(sequence_left, sequence_right):
+    """
+    Old processing function used in the ASR pipeline.
+
+    Finds the longest run of consecutive, identical elements shared by both sequences using a dynamic-programming
+    table.
+
+    Args:
+        sequence_left: first indexable sequence.
+        sequence_right: second indexable sequence.
+
+    Returns:
+        A tuple `(index_left, index_right, longest)`: the index of the first matched element in each sequence and
+        the length of the match. When several matches share the maximal length, the last one in row-major order of
+        the table is returned. When nothing matches, `(-1, -1, 0)` is returned.
+    """
+    seq_len_left = len(sequence_left)
+    seq_len_right = len(sequence_right)
+    # counter[i + 1][j + 1] is the length of the common run ending at sequence_left[i] and sequence_right[j]
+    counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
+    longest = 0
+    for i in range(seq_len_left):
+        for j in range(seq_len_right):
+            if sequence_left[i] == sequence_right[j]:
+                run_length = counter[i][j] + 1
+                counter[i + 1][j + 1] = run_length
+                if run_length > longest:
+                    longest = run_length
+
+    if longest == 0:
+        # nothing in common
+        return -1, -1, longest
+
+    # scan the table once (instead of once per returned index) and keep the last cell holding the maximum
+    end_left, end_right = np.argwhere(np.array(counter) == longest)[-1]
+    # that cell marks the end of the match: step back by its length to get the first matched element
+    return end_left - longest, end_right - longest, longest
+
+
+def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
+    """
+    Old processing function used in the ASR pipeline.
+
+    Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
+    `WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
+    iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
+    processed. We need to make sure to offset the timestamps tokens by the `time` in order for the tokenizer to
+    properly compute the final `offset`.
+
+    Args:
+        sequences: iterable of `(sequence, stride)` pairs. `sequence` is a list or array of token ids with a leading
+            dimension of size 1 and `stride` is a `(chunk_len, stride_left, stride_right)` triple. The stride values
+            are divided by `feature_extractor.sampling_rate` below, i.e. they are counted in samples.
+        tokenizer: used to look up the id of `<|notimestamps|>`; the next id is taken as the first timestamp token.
+        feature_extractor: provides `chunk_length` and `sampling_rate`.
+        max_source_positions: divides `chunk_length` to obtain the time covered by one timestamp-token step.
+
+    Returns:
+        A flat `list` holding the merged token ids of all sequences.
+    """
+    # index of the first timestamp token
+    timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
+    # merged segments collected so far; each one starts and ends with a timestamp token
+    items = []
+    # approximation of the token to time ratio : ~0.2seconds
+    # NOTE(review): the decoding tests in this file map one timestamp step to 0.02s; confirm the value quoted above
+    time_precision = feature_extractor.chunk_length / max_source_positions
+    # running start position of the current chunk (same unit as `chunk_len` and the strides)
+    time = 0
+    for seq_idx, item in enumerate(sequences):
+        sequence, stride = item
+        if isinstance(sequence, list):
+            sequence = np.array(sequence)
+        chunk_len, stride_left, stride_right = stride
+        # drop the leading dimension of size 1
+        sequence = sequence.squeeze(0)
+        # get rid of the `forced_decoder_idx` that are used to parametrize the generation
+        begin_idx = np.where(sequence == timestamp_begin)[0][0] if timestamp_begin in sequence else 0
+        sequence = sequence[begin_idx:]
+
+        # boolean mask of the positions holding a timestamp token
+        timestamp_tokens = sequence >= timestamp_begin
+        if seq_idx != 0 and sum(timestamp_tokens) > 0:
+            # positions of the second token of every pair of adjacent timestamp tokens
+            consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
+            last_timestamp = np.where(timestamp_tokens)[0][-1]
+            consecutive = np.append(consecutive, last_timestamp) if last_timestamp not in consecutive else consecutive
+            # the strides do not advance the running position
+            time -= stride_left + stride_right
+            # running position and left stride converted to a number of timestamp-token steps
+            offset = int((time / feature_extractor.sampling_rate) / time_precision)
+            overlap_time = int((stride_left / feature_extractor.sampling_rate) / time_precision)
+            # relevant timestamps are in the overlapping part
+            relevant_timestamp = np.where(sequence[consecutive] >= timestamp_begin + overlap_time)[0]
+            if relevant_timestamp.shape[0] > 0:
+                # from here on `relevant_timestamp` is a single position in `sequence`, no longer an array
+                relevant_timestamp = (
+                    consecutive[relevant_timestamp[0] - 1] if relevant_timestamp[0] > 0 else consecutive[0]
+                )
+                # if a big stride is used, we need to check some of the previous items for the best overlap
+                best_match = 0
+                sliced_sequence = []
+                # items are scanned from the most recent one backwards, so `idx` counts from the end of `items`
+                for idx, previous_sequence in enumerate(reversed(items)):
+                    # strip the timestamp tokens surrounding the previous item
+                    previous_tokens = previous_sequence[1:-1]
+                    if previous_sequence[0] < (timestamp_begin + offset - overlap_time) and idx != 0:
+                        break  # the previous sequence is too far in the past
+                    if len(previous_tokens) > 0:
+                        # find the longest common sequence between the overlapping parts
+                        index_left, index_right, match_length = _fast_find_longest_common_sequence(
+                            sequence[1:relevant_timestamp], previous_tokens
+                        )
+                        # don't do anything if only 1 token was matched
+                        if match_length > 1 and match_length > best_match:
+                            best_match = match_length
+                            best_idx = idx
+                            # position just past the first timestamp token found after the start of the match
+                            end_of_curr_sequence_idx = (
+                                np.where(sequence[index_left + 1 :] >= timestamp_begin)[0][0] + 1
+                            )
+                            end_of_curr_sequence_idx = end_of_curr_sequence_idx + 1 + index_left
+                            # if all the tokens are matched, suffix
+                            if index_left == 0 and match_length == len(previous_tokens):
+                                # keep the start and end timestamps of the previous item
+                                sliced_sequence = np.insert(
+                                    sequence[index_left + 1 : end_of_curr_sequence_idx], 0, previous_sequence[0]
+                                )
+                                sliced_sequence[-1] = previous_sequence[-1]
+                            # if part of the previous sequence is not taken
+                            elif index_left >= 0:
+                                sliced_sequence = sequence[index_left + 1 : end_of_curr_sequence_idx]
+                                # let's insert the missing part of the previous sequence
+                                previous_slice = (
+                                    previous_sequence[: index_right + 1] if index_right > 0 else [previous_sequence[0]]
+                                )
+                                sliced_sequence = np.insert(sliced_sequence, 0, previous_slice)
+                                # shift the closing timestamp by the running offset
+                                sliced_sequence[-1] += offset
+
+                if len(sliced_sequence) > 0:
+                    # replace the best matching item by the merged slice, drop the items that came after it and
+                    # consume the merged tokens from the current sequence
+                    items[len(items) - best_idx - 1] = sliced_sequence
+                    items = items[: len(items) - best_idx]
+                    sequence = sequence[end_of_curr_sequence_idx:]
+
+        # sequence might have changed
+        timestamp_tokens = sequence >= timestamp_begin
+        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
+        if sum(timestamp_tokens) > 0:
+            last_timestamp = np.where(timestamp_tokens)[0][-1]
+            # `+ 1` so that the slice built below includes the last timestamp token
+            consecutive = (
+                np.append(consecutive, last_timestamp + 1) if last_timestamp not in consecutive else consecutive
+            )
+
+        if len(consecutive) > 0:
+            last_slice = 0
+            # cut the sequence at every boundary and shift each slice so that it starts where the previously stored
+            # item ends while keeping its own duration
+            for current_slice in consecutive:
+                # the very first slice of the first sequence keeps its own start timestamp
+                actual_offset = items[-1][-1] if seq_idx != 0 or last_slice != 0 else sequence[0]
+                sliced_tokens = sequence[last_slice:current_slice]
+                duration = sliced_tokens[-1] - sliced_tokens[0]
+                sliced_tokens[0] = actual_offset
+                sliced_tokens[-1] = actual_offset + duration
+                items.append(sliced_tokens)
+                last_slice = current_slice
+
+        time += chunk_len
+    # flatten the stored items into a plain list of token ids
+    result = []
+    for i in range(len(items)):
+        result += items[i].tolist()
+    return result
--- a/transformers/tests/models/whisper/test_tokenization_whisper.py
+++ b/transformers/tests/models/whisper/test_tokenization_whisper.py
@@ -0,0 +1,590 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
+from transformers.models.whisper.tokenization_whisper import _combine_tokens_into_words, _find_longest_common_sequence
+from transformers.testing_utils import require_torch, slow
+
+from ...test_tokenization_common import TokenizerTesterMixin
+
+
+# Ids of the special tokens referenced by the expected encodings of the multilingual tokenizer tests below
+ES_CODE = 50262  # language token expected when `language="spanish"` is set
+EN_CODE = 50259  # <|en|>
+END_OF_TRANSCRIPT = 50257  # <|endoftext|>
+START_OF_TRANSCRIPT = 50258  # <|startoftranscript|>
+TRANSLATE = 50358  # task token expected when `task="translate"` is set
+TRANSCRIBE = 50359  # <|transcribe|>
+NOTIMESTAMPS = 50363  # <|notimestamps|>
+
+
+class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai/whisper-tiny"
+    tokenizer_class = WhisperTokenizer
+    rust_tokenizer_class = WhisperTokenizerFast
+    test_rust_tokenizer = True
+    test_sentencepiece = False
+    test_seq2seq = False
+
+    @classmethod
+    def setUpClass(cls):
+        """Save an `openai/whisper-tiny` tokenizer with a pad token configured into `cls.tmpdirname`."""
+        super().setUpClass()
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
+        # NOTE(review): the id is set first and the token string second; confirm which of the two is meant to win
+        tokenizer.pad_token_id = 50256
+        tokenizer.pad_token = "<|endoftext|>"
+        # tests that load from `self.tmpdirname` read this saved copy
+        tokenizer.save_pretrained(cls.tmpdirname)
+
+    def test_convert_token_and_id(self):
+        """Round-trip a known token/id pair through ``_convert_token_to_id`` and ``_convert_id_to_token``."""
+        known_token, known_id = "Where", 14436
+
+        # token -> id
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id(known_token), known_id)
+        # id -> token
+        self.assertEqual(self.get_tokenizer()._convert_id_to_token(known_id), known_token)
+
+    def test_get_vocab(self):
+        """Check the first, second and last vocabulary entries as well as the vocabulary length."""
+        ordered_tokens = list(self.get_tokenizer().get_vocab().keys())
+
+        # (position in the vocabulary, expected token)
+        for position, expected_token in ((0, "!"), (1, '"'), (-1, "<|30.00|>")):
+            self.assertEqual(ordered_tokens[position], expected_token)
+        self.assertEqual(len(ordered_tokens), 51865)
+
+    def test_vocab_size(self):
+        """The tokenizer under test reports a `vocab_size` of 50258."""
+        tokenizer = self.get_tokenizer()
+        self.assertEqual(tokenizer.vocab_size, 50258)
+
+    def test_full_tokenizer(self):
+        """Tokenize two sentences and check the tokens, their ids and the id -> token round trip."""
+        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
+
+        simple_tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(simple_tokens, ["This", "Ġis", "Ġa", "Ġtest"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(simple_tokens), [5723, 307, 257, 1500])
+
+        # sentence mixing digits, punctuation and a non-ASCII character
+        expected_tokens = ["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "Ã©", "."]  # fmt: skip
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, expected_tokens)
+
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [40, 390, 4232, 294, 1722, 25743, 11, 293, 341, 307, 16720, 526, 13])
+
+        # converting the ids back must give the same tokens
+        self.assertListEqual(tokenizer.convert_ids_to_tokens(ids), expected_tokens)
+
+    @unittest.skip
+    def test_tokenizer_slow_store_full_signature(self):
+        # NOTE(review): presumably overrides a test of the same name from `TokenizerTesterMixin` in order to skip it;
+        # no reason is recorded here, consider passing one to `unittest.skip`
+        pass
+
+    @unittest.skip
+    def test_tokenizer_fast_store_full_signature(self):
+        # NOTE(review): presumably overrides a test of the same name from `TokenizerTesterMixin` in order to skip it;
+        # no reason is recorded here, consider passing one to `unittest.skip`
+        pass
+
+    @unittest.skip
+    def test_special_tokens_initialization(self):
+        # Whisper relies on specific additional special tokens, so we skip this
+        # general test. In particular, this test loads the fast tokenizer from the
+        # slow tokenizer, and the conversion uses prefix_tokens, where we reference
+        # additional special tokens by specific indices; overriding the list with
+        # fewer tokens therefore leads to an index out of range error.
+        pass
+
+    @slow
+    def test_tokenizer_integration(self):
+        expected_encoding = {'input_ids': [[50257, 50362, 41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13, 50256], [50257, 50362, 13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13, 50256], [50257, 50362, 464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 50256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
+
+        self.tokenizer_integration_test_util(
+            expected_encoding=expected_encoding, model_name="openai/whisper-tiny.en", padding=False
+        )
+
+    def test_output_offsets(self):
+        """Decode timestamped token sequences with `output_offsets=True` and check the text and offsets."""
+        tokenizer = self.get_tokenizer()
+        # a single segment: start timestamp token, text tokens, then the end timestamp token twice
+        previous_sequence = [51492, 406, 3163, 1953, 466, 13, 51612, 51612]
+        self.assertEqual(
+            tokenizer.decode(previous_sequence, output_offsets=True),
+            {
+                "text": " not worth thinking about.",
+                "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}],
+            },
+        )
+
+        # A longer sequence whose first segment ends with the text tokens of `previous_sequence`. It is decoded on
+        # its own here (no merge is performed), so its timestamps start at 0.0.
+        next_sequences_1 = [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257]  # fmt: skip
+        self.assertEqual(
+            tokenizer.decode(next_sequences_1, output_offsets=True),
+            {
+                # the full text keeps the trailing `<|endoftext|>`, the per-segment offsets do not
+                "text": (
+                    " of spectators, retrievality is not worth thinking about. His instant panic was followed by a"
+                    " small, sharp blow high on his chest.<|endoftext|>"
+                ),
+                "offsets": [
+                    {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)},
+                    {
+                        "text": " His instant panic was followed by a small, sharp blow high on his chest.",
+                        "timestamp": (5.0, 9.4),
+                    },
+                ],
+            },
+        )
+
+    def test_find_longest_common_subsequence(self):
+        """Merge overlapping sequences with ``_find_longest_common_sequence`` and check the merged output."""
+        # each entry is (sequences to merge, expected merged sequence)
+        cases = [
+            # plain overlap between the end of the previous and the start of the next sequence
+            ([[1, 2, 3], [2, 3, 4, 5]], [1, 2, 3, 4, 5]),
+            # previous is larger than next:
+            # we merge what we can and remove the extra right side of the left sequence
+            ([[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5]], [1, 2, 3, 4, 5]),
+            # nothing in common
+            ([[1, 2, 3], [4, 5, 6]], [1, 2, 3, 4, 5, 6]),
+            # some errors in the overlap:
+            # we take from previous on the left, from the next on the right of the overlap
+            ([[1, 2, 3, 4, 99], [2, 98, 4, 5, 6]], [1, 2, 3, 4, 5, 6]),
+            ([[1, 2, 99, 4, 5], [2, 3, 4, 98, 6]], [1, 2, 99, 4, 98, 6]),
+            # this works on 3 sequences
+            ([[1, 2, 3], [2, 3, 4], [3, 4, 5]], [1, 2, 3, 4, 5]),
+            # this works on 3 sequences with errors
+            ([[1, 2, 3, 98, 5], [2, 99, 4, 5, 6, 7], [4, 97, 6, 7, 8]], [1, 2, 3, 4, 5, 6, 7, 8]),
+        ]
+        for sequences, expected_merge in cases:
+            self.assertEqual(_find_longest_common_sequence(sequences), expected_merge)
+
+    def test_skip_special_tokens_skips_prompt_ids(self):
+        """Both tokenizers drop the prompt and the special tokens when decoding with ``skip_special_tokens=True``."""
+        tokenizers = (self.get_tokenizer(), self.get_rust_tokenizer())
+        # fmt: off
+        encoded_input = [
+            50361, 2221, 13, 2326, 388, 391, 50258, 50259, 50359,
+            50363, 1282, 264, 2674, 9156, 295, 1523, 11, 2221, 13,
+            2326, 388, 391, 13657, 365, 2681, 21296, 17711, 13, 50257,
+        ]
+        # fmt: on
+        expected_with_special_tokens = "<|startofprev|> Mr. Quilter<|startoftranscript|><|en|><|transcribe|><|notimestamps|> On the general principles of art, Mr. Quilter writes with equal lucidity.<|endoftext|>"
+        expected_without_special_tokens = " On the general principles of art, Mr. Quilter writes with equal lucidity."
+        # (value of `skip_special_tokens`, expected decoded text)
+        expectations = (
+            (False, expected_with_special_tokens),
+            (True, expected_without_special_tokens),
+        )
+        for current_tokenizer in tokenizers:
+            for skip_special_tokens, expected_text in expectations:
+                decoded = current_tokenizer.decode(encoded_input, skip_special_tokens=skip_special_tokens)
+                self.assertEqual(decoded, expected_text)
+
+    def test_skip_special_tokens_with_timestamps(self):
+        """Timestamp tokens are kept by ``decode_with_timestamps=True`` even when special tokens are skipped."""
+        tokenizers = (self.get_tokenizer(), self.get_rust_tokenizer())
+
+        # fmt: off
+        encoded_input = [
+            50258, 50363, 50364, 634, 575, 12525, 22618, 1968, 6144,
+            35617, 20084, 1756, 311, 589, 307, 534, 10281, 934,
+            439, 293, 50676, 50676, 393, 4411, 294, 309, 457,
+            707, 295, 33301, 286, 392, 6628, 13, 50836, 50257,
+        ]
+        # fmt: on
+
+        expected_with_special_tokens = "<|startoftranscript|><|notimestamps|><|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|><|endoftext|>"
+        expected_without_special_tokens = "<|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and<|6.24|><|6.24|> can discover in it but little of rocky Ithaca.<|9.44|>"
+        # (value of `skip_special_tokens`, expected decoded text)
+        expectations = (
+            (False, expected_with_special_tokens),
+            (True, expected_without_special_tokens),
+        )
+        for current_tokenizer in tokenizers:
+            for skip_special_tokens, expected_text in expectations:
+                decoded = current_tokenizer.decode(
+                    encoded_input, decode_with_timestamps=True, skip_special_tokens=skip_special_tokens
+                )
+                self.assertEqual(decoded, expected_text)
+
+    def test_fast_tokenizer_get_prompt_ids(self):
+        """The slow and the fast tokenizer produce the same prompt ids for the same prompt."""
+        slow_tokenizer, fast_tokenizer = self.get_tokenizer(), self.get_rust_tokenizer()
+        prompt = "This is test prompt text."
+
+        slow_prompt_ids = slow_tokenizer.get_prompt_ids(prompt).tolist()
+        fast_prompt_ids = fast_tokenizer.get_prompt_ids(prompt).tolist()
+
+        self.assertListEqual(slow_prompt_ids, fast_prompt_ids)
+
+    def test_tokenizer_decode_prompt(self):
+        """Decode prompt + text ids with both tokenizers, with and without the prompt and special tokens."""
+        prompt_text = "What does the fox say?"
+        input_text = "Hatee hatee hatee ho"
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        # encode prompt and input text using tokenizer
+        prompt_ids = tokenizer.get_prompt_ids(prompt_text, return_tensors="np")
+        input_ids = tokenizer(input_text, return_tensors="np").input_ids[0]
+        input_ids = np.hstack([prompt_ids, input_ids])
+
+        # encode using fast tokenizer
+        rust_prompt_ids = rust_tokenizer.get_prompt_ids(prompt_text, return_tensors="np")
+        rust_input_ids = rust_tokenizer(input_text, return_tensors="np").input_ids[0]
+        rust_input_ids = np.hstack([rust_prompt_ids, rust_input_ids])
+
+        # check with prompt in output
+        pred_text = tokenizer.decode(input_ids, skip_special_tokens=False)
+        rust_pred_text = rust_tokenizer.decode(rust_input_ids, skip_special_tokens=False)
+
+        # check correctness for both tokenizers
+        expected_text = f"<|startofprev|> {prompt_text}<|startoftranscript|><|notimestamps|>{input_text}<|endoftext|>"
+        self.assertEqual(pred_text.strip(), expected_text)
+        self.assertEqual(rust_pred_text.strip(), expected_text)
+
+        # check stripping prompt from output
+        pred_text = tokenizer.decode(input_ids, skip_special_tokens=True)
+        # use the fast tokenizer with its own ids: decoding with the slow tokenizer a second time would leave the
+        # fast tokenizer's prompt stripping untested
+        rust_pred_text = rust_tokenizer.decode(rust_input_ids, skip_special_tokens=True)
+
+        self.assertEqual(pred_text.strip(), input_text)
+        self.assertEqual(rust_pred_text.strip(), input_text)
+
+    def test_combine_tokens_into_words(self):
+        """``_combine_tokens_into_words`` groups token ids into words identically for both tokenizers."""
+        tokenizers = (self.get_tokenizer(), self.get_rust_tokenizer())
+
+        # 'whatever "whatever" said someone, clever!?'
+        encoded_input = [1363, 7969, 503, 1363, 7969, 1, 848, 1580, 11, 13494, 7323]
+        # expected content of output[0] (words), output[1] (tokens per word) and output[2] (token indices per word)
+        expected_parts = (
+            ["whatever", ' "whatever"', " said", " someone,", " clever!?"],
+            [[1363, 7969], [503, 1363, 7969, 1], [848], [1580, 11], [13494, 7323]],
+            [[0, 1], [2, 3, 4, 5], [6], [7, 8], [9, 10]],
+        )
+        for current_tokenizer in tokenizers:
+            output = _combine_tokens_into_words(current_tokenizer, encoded_input)
+            for position, expected_part in enumerate(expected_parts):
+                self.assertEqual(expected_part, output[position])
+
+    def test_basic_normalizer(self):
+        """Check ``basic_normalize`` and ``remove_diacritics`` decoding for the slow and the fast tokenizer."""
+        tokenizers = (self.get_tokenizer(), self.get_rust_tokenizer())
+
+        input_str = "Hola güey!"
+        expected_output_normalize = "hola güey "
+        expected_output_diacritics = "hola guey "
+
+        # slow tokenizer first, fast tokenizer second
+        for current_tokenizer in tokenizers:
+            encoded_input = current_tokenizer(input_str).input_ids
+
+            # without normalization the input text comes back unchanged
+            plain = current_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=False)
+            self.assertEqual(plain, input_str)
+
+            normalized = current_tokenizer.decode(encoded_input, skip_special_tokens=True, basic_normalize=True)
+            self.assertEqual(normalized, expected_output_normalize)
+
+            without_diacritics = current_tokenizer.decode(
+                encoded_input, skip_special_tokens=True, basic_normalize=True, remove_diacritics=True
+            )
+            self.assertEqual(without_diacritics, expected_output_diacritics)
+
+    def test_decode_asr_with_word_level_timestamps(self):
+        """Decode one model output with `_decode_asr(..., return_timestamps="word")` and check the word chunks."""
+        # a single output holding seven token ids and one timestamp value per token
+        # fmt: off
+        model_outputs = [
+            {
+                'stride': [10, 0, 5],
+                'tokens': np.array([[50363, 3363, 11, 345, 460, 0, 50423]]),
+                'token_timestamps': np.array([[0.0, 0.5, 0.52, 0.78, 1.2, 1.28, 1.28]])
+            }
+        ]
+        # fmt: on
+
+        tokenizer = WhisperTokenizer.from_pretrained("onnx-community/whisper-tiny.en_timestamped")
+        result = tokenizer._decode_asr(
+            model_outputs, return_timestamps="word", return_language=False, time_precision=0.02
+        )
+
+        # the result is compared against a (full text, {"chunks": [...]}) tuple with one chunk per word
+        EXPECTED_OUTPUT = (
+            " Yes, you can!",
+            {
+                "chunks": [
+                    {"text": " Yes,", "timestamp": (0.0, 0.52)},
+                    {"text": " you", "timestamp": (0.52, 0.78)},
+                    {"text": " can!", "timestamp": (0.78, 1.28)},
+                ]
+            },
+        )
+        self.assertEqual(result, EXPECTED_OUTPUT)
+
+
+class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
+    checkpoint_name = "openai/whisper-small.en"
+
+    @classmethod
+    def setUpClass(cls):
+        cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
+        return cls
+
+    def test_tokenizer_equivalence(self):
+        """Encode a Korean sentence with the multilingual and the English-only tokenizer and compare the results."""
+        text = "다람쥐 헌 쳇바퀴에 타고파"
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="korean")
+        monolingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
+
+        monolingual_tokens = monolingual_tokenizer.encode(text, add_special_tokens=False)
+        multilingual_tokens = multilingual_tokenizer.encode(text, add_special_tokens=False)
+
+        # unittest assertions instead of bare `assert`: they are not stripped under `python -O` and report both
+        # values on failure
+        self.assertEqual(monolingual_tokenizer.decode(monolingual_tokens), text)
+        self.assertEqual(multilingual_tokenizer.decode(multilingual_tokens), text)
+        # the English-only tokenizer needs more tokens for the same text
+        self.assertGreater(len(monolingual_tokens), len(multilingual_tokens))
+
+        # fmt: off
+        EXPECTED_ENG = [
+            46695, 97, 167, 252, 234, 168, 98, 238, 220, 169,
+            245, 234, 23821, 111, 229, 167, 108, 242, 169, 222,
+            112, 168, 245, 238, 220, 169, 225, 222, 166, 111,
+            254, 169, 234, 234
+        ]
+        EXPECTED_MULTI = [
+            9835, 22855, 168, 98, 238, 13431, 234, 43517, 229, 47053,
+            169, 222, 19086, 19840, 1313, 17974
+        ]
+        # fmt: on
+
+        self.assertListEqual(monolingual_tokens, EXPECTED_ENG)
+        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
+
+    def test_tokenizer_special(self):
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
+            "openai/whisper-tiny", language="english", task="transcribe"
+        )
+        text = "Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
+
+        multilingual_tokens = multilingual_tokenizer.encode(text)
+
+        # fmt: off
+        # format: <|startoftranscript|> <|lang-id|> <|task|> <|notimestamps|> ... transcription ids ... <|endoftext|>
+        EXPECTED_MULTI = [
+            START_OF_TRANSCRIPT, EN_CODE, TRANSCRIBE, NOTIMESTAMPS, 7057, 0, 1012, 366, 291,
+            2633, 30, 508, 6, 1301, 287, 6, 36107, 631, 220, 11178,
+            115, 15567, 871, 44393, END_OF_TRANSCRIPT
+        ]
+        EXPECTED_SPECIAL_TEXT = (
+            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>Hey! How are you feeling? "
+            "J'ai l'impression que 郷さん est prêt<|endoftext|>"
+        )
+        # fmt: on
+
+        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
+
+        special_transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=False)
+        self.assertEqual(special_transcript, EXPECTED_SPECIAL_TEXT)
+
+        transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=True)
+        self.assertEqual(transcript, text)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.tokenizer.vocab_size, 50257)
+
+    # Copied from tests.models.speech_to_text.test_tokenization_speech_to_text.SpeechToTextTokenizerMultilinguialTest.test_tokenizer_decode_ignores_language_codes
+    def test_tokenizer_decode_ignores_language_codes(self):
+        self.assertIn(ES_CODE, self.tokenizer.all_special_ids)
+        generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2]
+        result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
+        expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
+        self.assertEqual(result, expected_spanish)
+        self.assertNotIn(self.tokenizer.eos_token, result)
+
+    def test_batch_encoding(self):
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
+            "openai/whisper-tiny", language="spanish", task="translate"
+        )
+        batch = ["El gato ", "El gato se sentó"]
+        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
+
+        # fmt: off
+        EXPECTED_MULTI = [
+            [START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 220,
+             END_OF_TRANSCRIPT, END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
+            [START_OF_TRANSCRIPT, ES_CODE, TRANSLATE, NOTIMESTAMPS, 17356, 290, 2513, 369,
+             2279, 812, END_OF_TRANSCRIPT]
+        ]
+        # fmt: on
+
+        self.assertListEqual(batch_output, EXPECTED_MULTI)
+
+    def test_set_prefix_tokens(self):
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
+            "openai/whisper-tiny", language="spanish", task="translate"
+        )
+
+        # change the language prefix token from Spanish to English
+        multilingual_tokenizer.set_prefix_tokens(language="english")
+
+        batch = ["the cat", "the cat sat"]
+        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
+
+        # fmt: off
+        EXPECTED_MULTI = [
+            [START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
+             END_OF_TRANSCRIPT, END_OF_TRANSCRIPT],
+            [START_OF_TRANSCRIPT, EN_CODE, TRANSLATE, NOTIMESTAMPS, 3322, 3857,
+             3227, END_OF_TRANSCRIPT]
+        ]
+        # fmt: on
+
+        self.assertListEqual(batch_output, EXPECTED_MULTI)
+
+    def test_batch_encoding_decoding(self):
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
+        batch = ["hola güey", "que onda"]
+        batch_encoding = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
+        transcription = multilingual_tokenizer.batch_decode(batch_encoding, skip_special_tokens=True)
+        self.assertListEqual(batch, transcription)
+
+    def test_offset_decoding(self):
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
+        # fmt: off
+        INPUT_TOKENS = [
+            50258, 50259, 50359, 50364, 441, 1857, 4174, 11, 5242, 366,
+            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
+            293, 25730, 311, 454, 34152, 4496, 904, 50724, 50724, 366,
+            382, 4048, 382, 257, 361, 18459, 13065, 13, 2221, 13,
+            7145, 74, 325, 38756, 311, 29822, 7563, 412, 472, 709,
+            294, 264, 51122, 51122, 912, 636, 300, 2221, 13, 2741,
+            5767, 1143, 281, 7319, 702, 7798, 13, 400, 2221, 13,
+            2619, 4004, 811, 2709, 702, 51449, 51449, 50257
+        ]
+        # fmt: on
+        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
+
+        self.assertEqual(
+            output,
+            [
+                {
+                    "text": (
+                        " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles"
+                    ),
+                    "timestamp": (0.0, 7.2),
+                },
+                {
+                    "text": (
+                        " are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the"
+                    ),
+                    "timestamp": (7.2, 15.16),
+                },
+                {
+                    "text": " same way that Mr. Carker used to flash his teeth. And Mr. John Colier gives his",
+                    "timestamp": (15.16, 21.7),
+                },
+            ],
+        )
+        # test `decode_with_offsets`
+        output = multilingual_tokenizer.decode(INPUT_TOKENS, decode_with_timestamps=True)
+        self.assertEqual(
+            output,
+            "<|startoftranscript|><|en|><|transcribe|><|0.00|> Lennils, pictures are a sort of upguards and atom"
+            " paintings, and Mason's exquisite idles<|7.20|><|7.20|> are as national as a jingo poem. Mr. Birkut"
+            " Foster's landscapes smile at one much in the<|15.16|><|15.16|> same way that Mr. Carker used to flash"
+            " his teeth. And Mr. John Colier gives his<|21.70|><|21.70|><|endoftext|>",
+        )
+        # test a single sequence with timestamps
+        # fmt: off
+        INPUT_TOKENS = [
+            50364, 441, 1857, 4174, 11, 5242, 366,
+            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
+            293, 25730, 311, 454, 34152, 4496, 904, 50724
+        ]
+        # fmt: on
+
+        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
+        self.assertEqual(
+            output[0],
+            {
+                "text": " Lennils, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles",
+                "timestamp": (0.0, 7.2),
+            },
+        )
+
+        # test a sequence without a single timestamps
+        # fmt: off
+        INPUT_TOKENS = [
+            441, 1857, 4174, 11, 5242, 366,
+            257, 1333, 295, 493, 2794, 2287, 293, 12018, 14880, 11,
+            293, 25730, 311, 454, 34152, 4496, 904, 50724
+        ]
+        # fmt: on
+
+        output = multilingual_tokenizer.decode(INPUT_TOKENS, output_offsets=True)["offsets"]
+        self.assertEqual(output, [])
+
+    def test_convert_to_list_np(self):
+        test_list = [[1, 2, 3], [4, 5, 6]]
+
+        # Test with an already converted list
+        self.assertListEqual(WhisperTokenizer._convert_to_list(test_list), test_list)
+        self.assertListEqual(WhisperTokenizerFast._convert_to_list(test_list), test_list)
+
+        # Test with a numpy array
+        np_array = np.array(test_list)
+        self.assertListEqual(WhisperTokenizer._convert_to_list(np_array), test_list)
+        self.assertListEqual(WhisperTokenizerFast._convert_to_list(np_array), test_list)
+
+    @require_torch
+    def test_convert_to_list_pt(self):
+        import torch
+
+        test_list = [[1, 2, 3], [4, 5, 6]]
+        torch_tensor = torch.tensor(test_list)
+        self.assertListEqual(WhisperTokenizer._convert_to_list(torch_tensor), test_list)
+        self.assertListEqual(WhisperTokenizerFast._convert_to_list(torch_tensor), test_list)