This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,240 @@
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import random
import tempfile
import unittest
import numpy as np
from datasets import Audio, load_dataset
from transformers import ClvpFeatureExtractor
from transformers.testing_utils import (
check_json_file_has_correct_format,
cleanup,
require_torch,
slow,
torch_device,
)
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torch_available():
import torch
global_rng = random.Random()
# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Build a nested list of random floats.

    The result has `shape[0]` rows of `shape[1]` values each. Every value is
    `rng.random() * scale`; the module-level `global_rng` is used when `rng` is
    not provided. `name` is accepted but not used.
    """
    generator = global_rng if rng is None else rng
    # values are drawn row by row, so the draw order matches a nested loop
    return [[generator.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
@require_torch
class ClvpFeatureExtractionTester:
    """Stores the settings used to build `ClvpFeatureExtractor` instances and their test inputs."""

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=10,
        hop_length=160,
        chunk_length=8,
        padding_value=0.0,
        sampling_rate=4_000,
        return_attention_mask=False,
    ):
        self.parent = parent

        # settings forwarded to the feature extractor by `prepare_feat_extract_dict`
        self.feature_size = feature_size
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.return_attention_mask = return_attention_mask

        # settings that drive input generation in `prepare_inputs_for_common`
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # step between the lengths of two consecutive variable-length inputs
        length_span = max_seq_length - min_seq_length
        self.seq_length_diff = length_span // (batch_size - 1)

    def prepare_feat_extract_dict(self):
        """Return the keyword arguments used to instantiate the feature extractor."""
        keys = (
            "feature_size",
            "hop_length",
            "chunk_length",
            "padding_value",
            "sampling_rate",
            "return_attention_mask",
        )
        return {key: getattr(self, key) for key in keys}

    # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common
    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Generate a batch of random inputs.

        With `equal_length=True` every input has `max_seq_length` rows; otherwise the
        lengths run from `min_seq_length` towards `max_seq_length` in steps of
        `seq_length_diff`. With `numpify=True` each input is converted with `np.asarray`.
        """

        # not called in this method; kept from the tester this method was copied from
        def _flatten(list_of_lists):
            return list(itertools.chain(*list_of_lists))

        if equal_length:
            lengths = [self.max_seq_length] * self.batch_size
        else:
            # make sure that inputs increase in size
            lengths = range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)

        speech_inputs = [floats_list((length, self.feature_size)) for length in lengths]
        if not numpify:
            return speech_inputs
        return [np.asarray(sample) for sample in speech_inputs]
@require_torch
class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for `ClvpFeatureExtractor`: save/load round-trips, call conventions and a slow integration check."""

    feature_extraction_class = ClvpFeatureExtractor

    def setUp(self):
        """Create the tester that supplies the feature extractor settings."""
        self.feat_extract_tester = ClvpFeatureExtractionTester(self)

    def tearDown(self):
        """Run the parent teardown, then call `cleanup` for the test device."""
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained
    def test_feat_extract_from_and_save_pretrained(self):
        """An extractor written with `save_pretrained` reloads with the same dict and mel filters."""
        extractor_cls = self.feature_extraction_class
        original = extractor_cls(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as save_dir:
            written_files = original.save_pretrained(save_dir)
            check_json_file_has_correct_format(written_files[0])
            reloaded = extractor_cls.from_pretrained(save_dir)

        original_dict = original.to_dict()
        reloaded_dict = reloaded.to_dict()
        filters_match = np.allclose(original.mel_filters, reloaded.mel_filters)
        self.assertTrue(filters_match)
        self.assertEqual(original_dict, reloaded_dict)

    # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file
    def test_feat_extract_to_json_file(self):
        """An extractor written with `to_json_file` reloads with the same dict and mel filters."""
        extractor_cls = self.feature_extraction_class
        original = extractor_cls(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as save_dir:
            json_file_path = os.path.join(save_dir, "feat_extract.json")
            original.to_json_file(json_file_path)
            reloaded = extractor_cls.from_json_file(json_file_path)

        original_dict = original.to_dict()
        reloaded_dict = reloaded.to_dict()
        filters_match = np.allclose(original.mel_filters, reloaded.mel_filters)
        self.assertTrue(filters_match)
        self.assertEqual(original_dict, reloaded_dict)

    def test_call(self):
        """Python lists and numpy arrays, batched or not, must give matching features."""

        def make_inputs(lengths):
            # one flat list of random floats per requested length
            return [floats_list((1, length))[0] for length in lengths]

        def as_arrays(sequences):
            return [np.asarray(sequence) for sequence in sequences]

        def assert_batches_close(batch_a, batch_b):
            for features_a, features_b in zip(batch_a, batch_b):
                self.assertTrue(np.allclose(features_a, features_b, atol=1e-3))

        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())

        # create three inputs of length 800, 1000, and 1200
        speech_inputs = make_inputs(range(800, 1400, 200))
        np_speech_inputs = as_arrays(speech_inputs)

        # Test feature size
        input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
        self.assertTrue(input_features.ndim == 3)
        self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)

        # Test not batched input
        from_list = feature_extractor(speech_inputs[0], return_tensors="np").input_features
        from_array = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
        self.assertTrue(np.allclose(from_list, from_array, atol=1e-3))

        # Test batched
        from_list = feature_extractor(speech_inputs, return_tensors="np").input_features
        from_array = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        assert_batches_close(from_list, from_array)

        # Test 2-D numpy arrays are batched.
        speech_inputs = make_inputs((800, 800, 800))
        np_speech_inputs = np.asarray(speech_inputs)
        from_list = feature_extractor(speech_inputs, return_tensors="np").input_features
        from_array = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        assert_batches_close(from_list, from_array)

        # Test truncation required
        max_samples = feature_extractor.n_samples
        speech_inputs = make_inputs(range(200, max_samples + 500, 200))
        np_speech_inputs = as_arrays(speech_inputs)
        np_speech_inputs_truncated = as_arrays(sample[:max_samples] for sample in speech_inputs)
        from_full = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        from_truncated = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
        assert_batches_close(from_full, from_truncated)

    # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
    def test_double_precision_pad(self):
        """`pad` must return float32 features for float64 inputs, for both numpy and torch outputs."""
        import torch

        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for candidate in (py_speech_inputs, np_speech_inputs):
            as_numpy = feature_extractor.pad([{"input_features": candidate}], return_tensors="np")
            self.assertTrue(as_numpy.input_features.dtype == np.float32)
            as_torch = feature_extractor.pad([{"input_features": candidate}], return_tensors="pt")
            self.assertTrue(as_torch.input_features.dtype == torch.float32)

    def _load_datasamples(self, num_samples):
        """Return the arrays and sampling rates of the first `num_samples` dummy LibriSpeech clips, sorted by id."""
        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        dataset = dataset.cast_column("audio", Audio(sampling_rate=22050))
        # automatic decoding with librispeech
        clips = dataset.sort("id")[:num_samples]["audio"]
        arrays = [clip["array"] for clip in clips]
        sampling_rates = [clip["sampling_rate"] for clip in clips]
        return arrays, sampling_rates

    @slow
    def test_integration(self):
        """Compare features from the `susnato/clvp_dev` extractor against recorded reference values."""
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                0.9271, 1.1405, 1.4419, 1.2470, 1.2438, 1.1787, 1.0595, 1.0570, 1.1070,
                1.2205, 1.2376, 1.2997, 1.1131, 1.0843, 1.0459, 1.1858, 1.2323, 1.3582,
                1.3401, 1.3770, 1.4173, 1.3381, 1.2291, 1.0854, 1.2116, 1.1873, 1.2178,
                1.2137, 1.3001, 1.4274
            ]
        )
        # fmt: on

        input_speech, sampling_rates = self._load_datasamples(1)

        feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")
        extracted = feature_extractor(input_speech, sampling_rate=sampling_rates[0], return_tensors="pt")
        input_features = extracted.input_features

        self.assertEqual(input_features.shape, (1, 80, 517))
        # only the first 30 frames of the first mel bin are compared
        torch.testing.assert_close(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4)

View File

@@ -0,0 +1,643 @@
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Clvp model."""
import tempfile
import unittest
import datasets
import numpy as np
from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig
from transformers.testing_utils import (
cleanup,
require_torch,
slow,
torch_device,
)
from transformers.utils import is_torch_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import ClvpEncoder, ClvpForCausalLM, ClvpModel, ClvpModelForConditionalGeneration
from transformers import ClvpFeatureExtractor, ClvpTokenizer
class ClvpEncoderTester:
    """Builds small `ClvpEncoderConfig` objects and random inputs for the `ClvpEncoder` tests."""

    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=7,
        is_training=False,
        use_input_mask=True,
        use_labels=True,
        vocab_size=50,
        hidden_size=128,
        projection_dim=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=32,
        dropout=0.1,
        attention_dropout=0.1,
        initializer_range=0.02,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.scope = scope
        # both special tokens use the last id of the vocabulary
        self.bos_token_id = vocab_size - 1
        self.eos_token_id = vocab_size - 1

    def get_config(self):
        """Return a `ClvpEncoderConfig` built from the tester's attributes."""
        encoder_config = ClvpEncoderConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            projection_dim=self.projection_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            dropout=self.dropout,
            attention_dropout=self.attention_dropout,
            initializer_range=self.initializer_range,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
        )

        return encoder_config

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, input_mask)`; `input_mask` is None when `use_input_mask` is False."""
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        if input_mask is not None:
            batch_size, seq_length = input_mask.shape
            # one cut position per row, drawn from [1, seq_length - 1)
            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
            for batch_idx, start_index in enumerate(rnd_start_indices):
                # overwrite the row: ones before the cut, zeros from the cut onwards
                input_mask[batch_idx, :start_index] = 1
                input_mask[batch_idx, start_index:] = 0

        encoder_config = self.get_config()

        return encoder_config, input_ids, input_mask

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` for the common tests.

        `attention_mask` is only added when a mask was generated, so a tester created
        with `use_input_mask=False` no longer fails on `None.to(...)`.
        """
        speech_config, input_ids, input_mask = self.prepare_config_and_inputs()
        inputs_dict = {"input_ids": input_ids.to(torch_device)}
        if input_mask is not None:
            inputs_dict["attention_mask"] = input_mask.to(torch_device)
        return speech_config, inputs_dict

    def create_and_check_model(self, speech_config, input_ids, input_mask):
        """Run a `ClvpEncoder` built from a fresh config and one built from `speech_config`, checking output shapes."""
        # same settings as `get_config`, except that bos/eos token ids are not passed
        text_config = ClvpEncoderConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            projection_dim=self.projection_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            dropout=self.dropout,
            attention_dropout=self.attention_dropout,
            initializer_range=self.initializer_range,
        )
        text_encoder_model = ClvpEncoder(config=text_config)
        text_encoder_model.to(torch_device)
        text_encoder_model.eval()
        with torch.no_grad():
            # run with and without the mask; only the last result is inspected
            result = text_encoder_model(input_ids, attention_mask=input_mask)
            result = text_encoder_model(input_ids)

        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))

        # now check with speech config
        speech_encoder_model = ClvpEncoder(config=speech_config)
        speech_encoder_model.to(torch_device)
        speech_encoder_model.eval()
        with torch.no_grad():
            result = speech_encoder_model(input_ids, attention_mask=input_mask)
            result = speech_encoder_model(input_ids)

        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))
@require_torch
class ClvpEncoderTest(ModelTesterMixin, unittest.TestCase):
    """Runs the common model tests and a config test against `ClvpEncoder`."""

    all_model_classes = (ClvpEncoder,) if is_torch_available() else ()

    # common-test switches that are turned off for this model
    test_torchscript = False
    test_head_masking = False
    test_pruning = False

    def setUp(self):
        """Create the model tester and the config tester."""
        self.model_tester = ClvpEncoderTester(self)
        self.encoder_config_tester = ConfigTester(self, config_class=ClvpEncoderConfig, hidden_size=32)

    def tearDown(self):
        """Run the parent teardown, then call `cleanup` for the test device."""
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    def test_config(self):
        """Run the shared configuration checks on `ClvpEncoderConfig`."""
        self.encoder_config_tester.run_common_tests()

    def test_model(self):
        """Build a config and inputs, then check the encoder output shapes."""
        encoder_config, input_ids, input_mask = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(encoder_config, input_ids, input_mask)

    @unittest.skip(reason="ClvpEncoder does not output loss")
    def test_training(self):
        pass

    @unittest.skip(reason="ClvpEncoder does not output loss")
    def test_training_gradient_checkpointing(self):
        pass
class ClvpDecoderTester:
    """Builds small `ClvpDecoderConfig` objects and random inputs for the decoder model tests."""

    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=3,
        is_training=False,
        vocab_size=300,
        max_position_embeddings=256,
        max_text_tokens=256,
        use_input_mask=True,
        hidden_size=128,
        num_hidden_layers=2,
        num_attention_heads=2,
        bos_token_id=97,
        eos_token_id=98,
        relative_attention_num_buckets=4,
        relative_attention_max_distance=16,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.max_text_tokens = max_text_tokens
        self.use_input_mask = use_input_mask
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance

    def get_config(self):
        """Return a `ClvpDecoderConfig` built from the tester's attributes."""
        decoder_config = ClvpDecoderConfig(
            vocab_size=self.vocab_size,
            max_position_embeddings=self.max_position_embeddings,
            max_text_tokens=self.max_text_tokens,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            relative_attention_max_distance=self.relative_attention_max_distance,
        )

        return decoder_config

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, input_mask)`; `input_mask` is None when `use_input_mask` is False."""
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        if input_mask is not None:
            batch_size, seq_length = input_mask.shape
            # one cut position per row, drawn from [1, seq_length - 1)
            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
            for batch_idx, start_index in enumerate(rnd_start_indices):
                # overwrite the row: ones before the cut, zeros from the cut onwards
                input_mask[batch_idx, :start_index] = 1
                input_mask[batch_idx, start_index:] = 0

        decoder_config = self.get_config()

        return decoder_config, input_ids, input_mask

    def create_and_check_model(self, config, input_ids, attention_mask):
        """Run `ClvpForCausalLM` once and check that the first output has shape (batch, seq, vocab)."""
        model = ClvpForCausalLM(config).to(torch_device).eval()
        with torch.no_grad():
            result = model(input_ids=input_ids, attention_mask=attention_mask)

        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.vocab_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` for the common tests.

        `attention_mask` is only added when a mask was generated, so a tester created
        with `use_input_mask=False` no longer fails on `None.to(...)`.
        """
        config, input_ids, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {"input_ids": input_ids.to(torch_device)}
        if attention_mask is not None:
            inputs_dict["attention_mask"] = attention_mask.to(torch_device)
        return config, inputs_dict
@require_torch
class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Common model, generation and pipeline tests for `ClvpModel` and `ClvpForCausalLM`."""

    all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
    pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {}

    test_pruning = False

    def setUp(self):
        """Create the model tester and the config tester."""
        self.model_tester = ClvpDecoderTester(self)
        self.decoder_config_tester = ConfigTester(self, config_class=ClvpDecoderConfig, hidden_size=32)

    def tearDown(self):
        """Run the parent teardown, then call `cleanup` for the test device."""
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    def test_model(self):
        """Build a config and inputs, then check the causal LM output shape."""
        config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(config, input_ids, attention_mask)

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        """Add all-zero `labels` to `inputs_dict` (in place) when labels are requested for `ClvpForCausalLM`."""
        if not return_labels or model_class != ClvpForCausalLM:
            return inputs_dict

        label_shape = [self.model_tester.batch_size, self.model_tester.seq_length]
        inputs_dict["labels"] = torch.zeros(label_shape, device=torch_device).long()
        return inputs_dict

    def _run_causal_lm_backward(self, gradient_checkpointing):
        """Run one forward/backward pass of `ClvpForCausalLM` in train mode."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        if gradient_checkpointing:
            config.use_cache = False
        config.return_dict = True

        model = ClvpForCausalLM(config)
        model.to(torch_device)
        if gradient_checkpointing:
            model.gradient_checkpointing_enable()
        model.train()

        inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)
        loss = model(**inputs).loss
        loss.backward()

    def test_training(self):
        # we will only test the ClvpForCausalLM since it outputs loss
        self._run_causal_lm_backward(gradient_checkpointing=False)

    def test_training_gradient_checkpointing(self):
        # we will only test the ClvpForCausalLM since it outputs loss
        self._run_causal_lm_backward(gradient_checkpointing=True)

    @unittest.skip(reason="Clvp `prepare_inputs_for_generation` function doesn't have cache position.")
    def test_generate_continue_from_inputs_embeds(self):
        pass
class ClvpModelForConditionalGenerationTester:
    """Builds a small `ClvpConfig` plus text and audio inputs for the `ClvpModelForConditionalGeneration` tests."""

    def __init__(self, parent, is_training=False):
        self.parent = parent
        self.is_training = is_training
        self.clvp_encoder_tester = ClvpEncoderTester(parent)
        self.batch_size = self.clvp_encoder_tester.batch_size  # need bs for batching_equivalence test

    def get_config(self):
        """Return a `ClvpConfig` assembled from text, speech and decoder sub-configs."""
        decoder_settings = {
            "vocab_size": 50,
            "max_position_embeddings": 30,
            "max_text_tokens": 30,
            "hidden_size": 128,
            "num_hidden_layers": 1,
            "num_attention_heads": 2,
            "bos_token_id": 97,
            "eos_token_id": 98,
            "relative_attention_num_buckets": 4,
            "relative_attention_max_distance": 16,
        }
        decoder_config = ClvpDecoderConfig(**decoder_settings)

        encoder_tester = self.clvp_encoder_tester
        text_config = encoder_tester.get_config()
        # the speech config starts from the same encoder settings, with its own vocabulary size
        speech_config = encoder_tester.get_config()
        speech_config.vocab_size = 300

        return ClvpConfig.from_sub_model_configs(
            text_config,
            speech_config,
            decoder_config,
            projection_dim=16,
        )

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, input_features)`; the features come from one dummy LibriSpeech clip."""
        _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()

        dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=22050))
        first_clip = dataset.sort("id")[0]["audio"]
        waveform = first_clip["array"]
        clip_rate = first_clip["sampling_rate"]

        feature_extractor = ClvpFeatureExtractor()
        extracted = feature_extractor(raw_speech=waveform, sampling_rate=clip_rate, return_tensors="pt")
        input_features = extracted["input_features"].to(torch_device)

        config = self.get_config()

        return config, input_ids, attention_mask, input_features

    def create_and_check_model(self, config, input_ids, attention_mask, input_features):
        """Run the model once and check the shapes of both similarity logit matrices."""
        model = ClvpModelForConditionalGeneration(config).to(torch_device).eval()
        with torch.no_grad():
            result = model(input_ids=input_ids, input_features=input_features, attention_mask=attention_mask)

        text_batch_size = self.clvp_encoder_tester.batch_size
        self.parent.assertEqual(result.logits_per_speech.shape, (2, text_batch_size))
        self.parent.assertEqual(result.logits_per_text.shape, (text_batch_size, 2))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` for the common tests, with every tensor moved to the test device."""
        config, input_ids, attention_mask, input_features = self.prepare_config_and_inputs()

        tensors = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "input_features": input_features,
        }
        inputs_dict = {key: tensor.to(torch_device) for key, tensor in tensors.items()}
        inputs_dict["return_loss"] = False

        return config, inputs_dict
@require_torch
class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
    """Common model tests for `ClvpModelForConditionalGeneration`, with CLVP-specific overrides."""

    all_model_classes = (ClvpModelForConditionalGeneration,) if is_torch_available() else ()
    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
    all_generative_model_classes = ()

    test_head_masking = False
    test_pruning = False
    test_resize_embeddings = False
    test_attention_outputs = False
    test_torchscript = False

    def setUp(self):
        """Create the model tester and the `ClvpConfig` tester."""
        self.model_tester = ClvpModelForConditionalGenerationTester(self)
        common_properties = ["projection_dim", "logit_scale_init_value"]
        self.clvp_config_tester = ConfigTester(
            self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32
        )

    def test_config(self):
        """Run the shared configuration checks on `ClvpConfig`."""
        self.clvp_config_tester.run_common_tests()

    def tearDown(self):
        """Run the parent teardown, then call `cleanup` for the test device."""
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device)

    def test_model(self):
        """Build a config and inputs, then check the similarity logit shapes."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_hidden_states_output(self):
        """Check the number and trailing dimensions of decoder, text encoder and speech encoder hidden states."""

        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            # check for decoder model, text encoder model and speech encoder model hidden states
            decoder_hidden_states = outputs.decoder_hidden_states
            text_encoder_hidden_states = outputs.text_encoder_hidden_states
            speech_encoder_hidden_states = outputs.speech_encoder_hidden_states

            # check length of the hidden states (one per layer, plus one)
            expected_decoder_num_layers = config.decoder_config.num_hidden_layers + 1
            self.assertEqual(len(decoder_hidden_states), expected_decoder_num_layers)

            expected_text_encoder_num_layers = config.text_config.num_hidden_layers + 1
            self.assertEqual(len(text_encoder_hidden_states), expected_text_encoder_num_layers)

            expected_speech_encoder_num_layers = config.speech_config.num_hidden_layers + 1
            self.assertEqual(len(speech_encoder_hidden_states), expected_speech_encoder_num_layers)

            # check shapes of each hidden state

            # for the decoder model we will only test the dimension because the ClvpConditioningEncoder could increase
            # the sequence lengths.
            self.assertEqual(decoder_hidden_states[0].shape[-1], config.decoder_config.hidden_size)

            # the testing for text encoder stays standard because we just pass the text tokens here.
            self.assertListEqual(
                list(text_encoder_hidden_states[0].shape[-2:]),
                [self.model_tester.clvp_encoder_tester.seq_length, config.text_config.hidden_size],
            )

            # for the speech encoder model we will only test the dimension because the fix_decoder_outputs method
            # could increase the sequence lengths by adding `decoder_fixing_codes` tokens at the end.
            self.assertEqual(speech_encoder_hidden_states[0].shape[-1], config.speech_config.hidden_size)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    @unittest.skip(reason="Retain_grad is tested in individual model tests")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
    def test_model_get_set_embeddings(self):
        pass

    # override as the `logit_scale` parameter initialization is different for Clvp
    def test_initialization(self):
        """Check parameter initialization under a zero-init config, with a dedicated check for `logit_scale`."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if param.requires_grad:
                    # check if `logit_scale` is initialized as per the original implementation
                    if name == "logit_scale":
                        expected_value = np.log(1 / 0.07)
                        returned_value = param.data.item()

                        self.assertAlmostEqual(
                            returned_value,
                            expected_value,
                            delta=1e-3,
                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                        )
                    else:
                        # the mean, rounded to 9 decimals, must be exactly 0.0 or 1.0
                        expected_range = [0.0, 1.0]
                        returned_range = ((param.data.mean() * 1e9).round() / 1e9).item()

                        self.assertIn(
                            returned_range,
                            expected_range,
                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                        )

    def test_load_speech_text_decoder_config(self):
        """A saved `ClvpConfig` can be loaded as `ClvpEncoderConfig` and as `ClvpDecoderConfig`."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        # Save ClvpConfig and check if we can load ClvpEncoderConfig from it
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            config.save_pretrained(tmp_dir_name)
            encoder_config = ClvpEncoderConfig.from_pretrained(tmp_dir_name)
            self.assertDictEqual(config.text_config.to_dict(), encoder_config.to_dict())

        # Save ClvpConfig and check if we can load ClvpDecoderConfig from it
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            config.save_pretrained(tmp_dir_name)
            decoder_config = ClvpDecoderConfig.from_pretrained(tmp_dir_name)
            self.assertDictEqual(config.decoder_config.to_dict(), decoder_config.to_dict())

    @slow
    def test_model_from_pretrained(self):
        """The `susnato/clvp_dev` checkpoint loads into `ClvpModelForConditionalGeneration`."""
        model_name = "susnato/clvp_dev"
        model = ClvpModelForConditionalGeneration.from_pretrained(model_name)
        self.assertIsNotNone(model)
# Since Clvp is made of several models connected with each other, it's better to test each of them individually in
# addition to `test_full_model_integration`. If the model breaks in the future, this helps to identify the broken part.
@slow
@require_torch
class ClvpIntegrationTest(unittest.TestCase):
    """Slow tests that compare outputs of the `susnato/clvp_dev` checkpoint against recorded reference values."""

    def setUp(self):
        """Load one audio clip, the pretrained model, and the tokenized text and audio features."""
        checkpoint = "susnato/clvp_dev"
        self.text = "This is an example text."

        dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=22050))
        first_clip = dataset.sort("id")["audio"][0]
        self.speech_samples = first_clip["array"]
        self.sr = first_clip["sampling_rate"]

        self.model = ClvpModelForConditionalGeneration.from_pretrained(checkpoint).to(torch_device)
        self.model.eval()

        tokenizer = ClvpTokenizer.from_pretrained(checkpoint)
        feature_extractor = ClvpFeatureExtractor.from_pretrained(checkpoint)

        encoded_text = tokenizer(self.text, return_tensors="pt")
        self.text_tokens = encoded_text["input_ids"].to(torch_device)

        extracted = feature_extractor(raw_speech=self.speech_samples, sampling_rate=self.sr, return_tensors="pt")
        self.input_features = extracted["input_features"].to(torch_device)

    def tearDown(self):
        """Run the parent teardown, then call `cleanup` with garbage collection enabled."""
        super().tearDown()
        # clean-up as much as possible GPU memory occupied by PyTorch
        cleanup(torch_device, gc_collect=True)

    def test_conditional_encoder(self):
        """Check the shape and the top-left 3x3 values of the conditioning encoder output."""
        with torch.no_grad():
            encoder_output = self.model.conditioning_encoder(
                input_features=self.input_features, input_ids=self.text_tokens
            )
            conditioning_encoder_outputs = encoder_output.to("cpu")

        expected_shape = torch.Size((self.input_features.shape[0], 18, self.model.config.decoder_config.hidden_size))
        self.assertEqual(conditioning_encoder_outputs.shape, expected_shape)

        EXPECTED_OUTPUTS = torch.tensor(
            [[-0.8582, 0.5228, 1.9944], [-0.0465, -1.1017, -0.0093], [-0.0466, -0.6030, -0.1280]]
        )

        torch.testing.assert_close(conditioning_encoder_outputs[0, :3, :3], EXPECTED_OUTPUTS, rtol=1e-4, atol=1e-4)

    def test_decoder_model_generate(self):
        """Check the token ids generated by the speech decoder from the text tokens."""
        generated = self.model.speech_decoder_model.generate(input_ids=self.text_tokens)
        autoregressive_model_output = generated.cpu()

        EXPECTED_OUTPUTS = torch.tensor([[147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 9, 8193]])

        torch.testing.assert_close(autoregressive_model_output, EXPECTED_OUTPUTS)

    def test_text_and_speech_encoder_models(self):
        """Check the first 20 embedding values produced by the text encoder and by the speech encoder."""
        # check for text embeds
        text_output = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)
        text_embeds = text_output[0].cpu()

        # fmt: off
        EXPECTED_TEXT_EMBEDS = torch.tensor([1.4798, -2.0005, 2.3902, -0.5042, 1.6401, -2.4135, -1.4800, 3.0118, -2.4422, 1.3266, 2.2339, 1.4761, -4.8983, -1.3592, 6.0251, 6.7364, 2.2576, 3.7229, -10.0436, 4.6676])
        # fmt: on

        torch.testing.assert_close(text_embeds[0, :20], EXPECTED_TEXT_EMBEDS, rtol=1e-4, atol=1e-4)

        # check for speech embeds
        speech_output = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)
        speech_embeds = speech_output[0].cpu()

        # fmt: off
        EXPECTED_SPEECH_EMBEDS = torch.tensor([3.1202, -3.1183, -1.4264, -6.1339, 1.8885, -0.1983, 0.9461, -1.7414, 0.3320, -3.8400, -1.5715, 1.5096, -1.7576, 0.2387, 4.9758, 5.8450, -6.2534, 2.8587, -5.5816, 4.7821])
        # fmt: on

        torch.testing.assert_close(speech_embeds[0, :20], EXPECTED_SPEECH_EMBEDS, rtol=1e-4, atol=1e-4)

    def test_full_model_integration(self):
        """Check the speech ids and similarity scores from a 4-beam `generate` call on the full model."""
        generation_kwargs = {
            "do_sample": False,
            "num_beams": 4,
            "num_return_sequences": 4,
            "max_new_tokens": 10,
        }
        full_model_output = self.model.generate(
            input_ids=self.text_tokens,
            input_features=self.input_features,
            **generation_kwargs,
        )

        EXPECTED_SPEECH_IDS = torch.tensor([[1953, 1080, 612], [1953, 612, 493], [1953, 612, 716]])
        EXPECTED_SIMILARITY_SCORES = torch.tensor([[14.7660, 14.4569, 13.6472, 13.5683]])

        # only the bottom-right 3x3 corner of the speech ids is compared
        speech_ids = full_model_output.speech_ids.cpu()
        similarity_scores = full_model_output.logits_per_text.cpu()
        torch.testing.assert_close(speech_ids[-3:, -3:], EXPECTED_SPEECH_IDS)
        torch.testing.assert_close(similarity_scores, EXPECTED_SIMILARITY_SCORES)

View File

@@ -0,0 +1,124 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import shutil
import tempfile
import unittest
from transformers import ClvpFeatureExtractor, ClvpProcessor, ClvpTokenizer
from transformers.testing_utils import require_torch
from .test_feature_extraction_clvp import floats_list
@require_torch
class ClvpProcessorTest(unittest.TestCase):
    # Checks that ClvpProcessor agrees with the tokenizer and feature extractor it wraps,
    # both loaded from `self.checkpoint`.

    def setUp(self):
        # Scratch directory for the save/load round trips; removed again in tearDown.
        self.tmpdirname = tempfile.mkdtemp()
        self.checkpoint = "susnato/clvp_dev"

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmpdirname)
        gc.collect()

    # Adapted from transformers.tests.models.whisper.test_processing_whisper.WhisperProcessorTest.get_tokenizer with Whisper->Clvp
    def get_tokenizer(self, **kwargs):
        return ClvpTokenizer.from_pretrained(self.checkpoint, **kwargs)

    # Adapted from transformers.tests.models.whisper.test_processing_whisper.WhisperProcessorTest.get_feature_extractor with Whisper->Clvp
    def get_feature_extractor(self, **kwargs):
        return ClvpFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)

    def _build_processor(self):
        # Loads both components with default arguments and wraps them in a processor.
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
        processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
        return tokenizer, feature_extractor, processor

    # Adapted from transformers.tests.models.whisper.test_processing_whisper.WhisperProcessorTest.test_save_load_pretrained_default with Whisper->Clvp
    def test_save_load_pretrained_default(self):
        tokenizer, feature_extractor, processor = self._build_processor()

        processor.save_pretrained(self.tmpdirname)
        reloaded = ClvpProcessor.from_pretrained(self.tmpdirname)

        # The reloaded components must match the ones that were saved.
        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, ClvpTokenizer)

        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, ClvpFeatureExtractor)

    # Adapted from transformers.tests.models.whisper.test_processing_whisper.WhisperProcessorTest.test_feature_extractor with Whisper->Clvp,processor(raw_speech->processor(raw_speech=raw_speech
    def test_feature_extractor(self):
        _, feature_extractor, processor = self._build_processor()

        raw_speech = floats_list((3, 1000))

        from_extractor = feature_extractor(raw_speech, return_tensors="np")
        from_processor = processor(raw_speech=raw_speech, return_tensors="np")

        # Compare per-key sums rather than full arrays, within an absolute tolerance.
        for key, extractor_values in from_extractor.items():
            self.assertAlmostEqual(extractor_values.sum(), from_processor[key].sum(), delta=1e-2)

    # Adapted from transformers.tests.models.whisper.test_processing_whisper.WhisperProcessorTest.test_tokenizer with Whisper->Clvp
    def test_tokenizer(self):
        tokenizer, _, processor = self._build_processor()

        input_str = "This is a test string"

        from_processor = processor(text=input_str)
        from_tokenizer = tokenizer(input_str)

        for key, tokenizer_values in from_tokenizer.items():
            self.assertListEqual(tokenizer_values, from_processor[key])

    # Adapted from transformers.tests.models.whisper.test_processing_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->Clvp
    def test_tokenizer_decode(self):
        tokenizer, _, processor = self._build_processor()

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        via_processor = processor.batch_decode(predicted_ids)
        via_tokenizer = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(via_tokenizer, via_processor)

    def test_save_load_pretrained_additional_features(self):
        # Save with default components, then reload with overrides and compare against
        # components loaded directly with the same overrides.
        pad_token = "(PAD)"
        sampling_rate = 16000

        saved = ClvpProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
        saved.save_pretrained(self.tmpdirname)

        reference_tokenizer = self.get_tokenizer(pad_token=pad_token)
        reference_feature_extractor = self.get_feature_extractor(sampling_rate=sampling_rate)

        reloaded = ClvpProcessor.from_pretrained(
            self.tmpdirname,
            pad_token=pad_token,
            sampling_rate=sampling_rate,
        )

        self.assertEqual(reloaded.tokenizer.get_vocab(), reference_tokenizer.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, ClvpTokenizer)

        self.assertEqual(
            reloaded.feature_extractor.to_json_string(), reference_feature_extractor.to_json_string()
        )
        self.assertIsInstance(reloaded.feature_extractor, ClvpFeatureExtractor)

View File

@@ -0,0 +1,314 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import unittest
from transformers import ClvpTokenizer
from ...test_tokenization_common import TokenizerTesterMixin, slow
class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Hub checkpoint id; the @slow tests in this file load the same id explicitly.
from_pretrained_id = "susnato/clvp_dev"
# Tokenizer class under test.
tokenizer_class = ClvpTokenizer
# When False, test_rust_and_python_full_tokenizers skips itself.
test_rust_tokenizer = False
# NOTE(review): presumably forwarded by TokenizerTesterMixin when loading pretrained tokenizers - confirm.
from_pretrained_kwargs = {"add_prefix_space": True}
# NOTE(review): the next two flags are not read in this file; presumably consumed by TokenizerTesterMixin - confirm.
test_seq2seq = False
test_sentencepiece_ignore_case = True
@classmethod
def setUpClass(cls):
    # Writes a miniature BPE vocabulary (vocab.json) and merges table (merges.txt) into
    # `cls.tmpdirname` and records their paths on the class.
    # NOTE(review): `cls.tmpdirname` is presumably created by the parent setUpClass - confirm.
    super().setUpClass()

    # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
    vocab = [
        "l",
        "o",
        "w",
        "e",
        "r",
        "s",
        "t",
        "i",
        "d",
        "n",
        "\u0120",
        "\u0120l",
        "\u0120n",
        "\u0120lo",
        "\u0120low",
        "er",
        "\u0120lowest",
        "\u0120newer",
        "\u0120wider",
        "<unk>",
        "<|endoftext|>",
        "[SPACE]",
    ]
    # Each token's id is its position in the list above.
    token_to_id = {token: index for index, token in enumerate(vocab)}
    merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
    cls.special_tokens_map = {"unk_token": "<unk>"}

    cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json")
    cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt")

    vocab_payload = json.dumps(token_to_id) + "\n"
    with open(cls.vocab_file, "w", encoding="utf-8") as vocab_handle:
        vocab_handle.write(vocab_payload)

    merges_payload = "\n".join(merges)
    with open(cls.merges_file, "w", encoding="utf-8") as merges_handle:
        merges_handle.write(merges_payload)
# Adapted from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
@classmethod
def get_tokenizer(cls, pretrained_name=None, **kwargs):
    # Entries of `cls.special_tokens_map` override caller-supplied keyword arguments of the same name.
    merged_kwargs = {**kwargs, **cls.special_tokens_map}
    # A falsy `pretrained_name` falls back to the class temp directory.
    location = cls.tmpdirname if not pretrained_name else pretrained_name
    return ClvpTokenizer.from_pretrained(location, **merged_kwargs)
# Adapted from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
    # Returns a fixed (input_text, output_text) pair; `tokenizer` is not used.
    return "lower newer", "lower[SPACE]newer"
# Adapted from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
def test_add_special_tokens(self):
    # A newly registered special token must encode to a single id and must be dropped
    # again when decoding with skip_special_tokens=True.
    special_token = "[SPECIAL_TOKEN]"

    tokenizers: list[ClvpTokenizer] = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            tokenizer.add_special_tokens({"cls_token": special_token})

            # A fresh box list is built for every tokenizer, as in the LayoutXLM original.
            special_token_box = [1000, 1000, 1000, 1000]
            token_ids = tokenizer.encode([special_token], boxes=[special_token_box], add_special_tokens=False)
            self.assertEqual(len(token_ids), 1)

            decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
            self.assertTrue(special_token not in decoded_text)
# Adapted from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers
def test_rust_and_python_full_tokenizers(self):
    # Compares the Python tokenizer with the rust tokenizer; skipped unless the class enables rust tests.
    if not self.test_rust_tokenizer:
        self.skipTest(reason="test_rust_tokenizer is set to False")

    sequence = "lower newer"
    tokenizer = self.get_tokenizer()
    rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)

    # Testing tokenization
    python_tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
    self.assertListEqual(python_tokens, rust_tokenizer.tokenize(sequence))

    # Testing conversion to ids without special tokens
    python_ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
    self.assertListEqual(python_ids, rust_tokenizer.encode(sequence, add_special_tokens=False))

    # Testing conversion to ids with special tokens (on a freshly built rust tokenizer)
    rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
    python_ids = tokenizer.encode(sequence, add_prefix_space=True)
    self.assertListEqual(python_ids, rust_tokenizer.encode(sequence))

    # Testing the unknown token
    tokens_with_unk = python_tokens + [rust_tokenizer.unk_token]
    expected_ids = [14, 15, 10, 9, 3, 2, 15, 19]
    self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(tokens_with_unk), expected_ids)
# Adapted from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding
def test_padding(self, max_length=15):
    # Every encoding entry point is expected to raise ValueError when asked to pad to `max_length`.
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

            single = "This is a simple input"
            single_batch = ["This is a simple input 1", "This is a simple input 2"]
            pair = ("This is a simple input", "This is a pair")
            pair_batch = [
                ("This is a simple input 1", "This is a simple input 2"),
                ("This is a simple pair 1", "This is a simple pair 2"),
            ]

            # Simple inputs first, then pair inputs.
            attempts = (
                (tokenizer_r.encode, single),
                (tokenizer_r.encode_plus, single),
                (tokenizer_r.batch_encode_plus, single_batch),
                (tokenizer_r.encode, pair),
                (tokenizer_r.encode_plus, pair),
                (tokenizer_r.batch_encode_plus, pair_batch),
            )
            for encode_fn, text in attempts:
                self.assertRaises(ValueError, encode_fn, text, max_length=max_length, padding="max_length")
# Adapted from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding_if_pad_token_set_slow
# (differs from the GPT2 version: the unrecognized `truncate=` keyword is replaced by `truncation=`)
def test_padding_if_pad_token_set_slow(self):
    # With a pad token configured, both "max_length" and automatic padding must produce
    # padded ids and a matching attention mask, for single inputs and for pairs.
    tokenizer = ClvpTokenizer.from_pretrained(self.tmpdirname, pad_token="<pad>")

    # Simple input
    s = "This is a simple input"
    s2 = ["This is a simple input looooooooong", "This is a simple input"]
    p = ("This is a simple input", "This is a pair")
    p2 = [
        ("This is a simple input loooooong", "This is a simple input"),
        ("This is a simple pair loooooong", "This is a simple pair"),
    ]

    pad_token_id = tokenizer.pad_token_id

    out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
    # `truncation` is the keyword the tokenizer call accepts (`truncate` is not a recognized argument).
    # No `max_length` is passed here; the lengths asserted below (33 and 52) assume nothing is cut.
    out_s2 = tokenizer(s2, padding=True, truncation=True, return_tensors="np")
    out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
    out_p2 = tokenizer(p2, padding=True, truncation=True, return_tensors="np")

    # s
    # test single string max_length padding
    self.assertEqual(out_s["input_ids"].shape[-1], 30)
    self.assertIn(pad_token_id, out_s["input_ids"])
    self.assertIn(0, out_s["attention_mask"])

    # s2
    # test automatic padding
    self.assertEqual(out_s2["input_ids"].shape[-1], 33)
    # long slice doesn't have padding
    self.assertNotIn(pad_token_id, out_s2["input_ids"][0])
    self.assertNotIn(0, out_s2["attention_mask"][0])
    # short slice does have padding
    self.assertIn(pad_token_id, out_s2["input_ids"][1])
    self.assertIn(0, out_s2["attention_mask"][1])

    # p
    # test single pair max_length padding
    self.assertEqual(out_p["input_ids"].shape[-1], 60)
    self.assertIn(pad_token_id, out_p["input_ids"])
    self.assertIn(0, out_p["attention_mask"])

    # p2
    # test automatic padding pair
    self.assertEqual(out_p2["input_ids"].shape[-1], 52)
    # long slice pair doesn't have padding
    self.assertNotIn(pad_token_id, out_p2["input_ids"][0])
    self.assertNotIn(0, out_p2["attention_mask"][0])
    # short slice pair does have padding
    self.assertIn(pad_token_id, out_p2["input_ids"][1])
    self.assertIn(0, out_p2["attention_mask"][1])
# Adapted from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_special_tokens_mask_input_pairs_and_bos_token
def test_special_tokens_mask_input_pairs_and_bos_token(self):
    # Dropping every position flagged in the special-tokens mask from the pair encoding must
    # leave exactly the ids of the two sequences encoded without special tokens.
    # TODO: change to self.get_tokenizers() when the fast version is implemented
    tokenizers = [self.get_tokenizer(do_lower_case=False, add_bos_token=True)]
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            sequence_0 = "Encode this."
            sequence_1 = "This one too please."

            plain_ids = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
                sequence_1, add_special_tokens=False
            )

            pair_encoding = tokenizer.encode_plus(
                sequence_0,
                sequence_1,
                add_special_tokens=True,
                return_special_tokens_mask=True,
            )
            ids_with_special = pair_encoding["input_ids"]
            special_tokens_mask = pair_encoding["special_tokens_mask"]
            self.assertEqual(len(special_tokens_mask), len(ids_with_special))

            # Keep only the ids whose mask entry is falsy.
            kept_ids = [
                token_id
                for token_id, is_special in zip(ids_with_special, special_tokens_mask)
                if not is_special
            ]
            self.assertEqual(plain_ids, kept_ids)
def test_token_type_ids(self):
    # Sequence 0 and sequence 1 are expected to be tagged with token type ids 0 and 1
    # respectively, regardless of whether the model uses token type ids. Other code
    # (the QA pipeline, among others) relies on this assumption.
    tokenizer = self.get_tokenizer()
    first_sequence = "Test this method."

    encoding = tokenizer(first_sequence, return_token_type_ids=True, add_special_tokens=True)
    token_type_ids = encoding["token_type_ids"]
    self.assertIn(0, token_type_ids)
def test_full_tokenizer(self):
    # Builds the tokenizer directly from `self.vocab_file` / `self.merges_file` and checks
    # both the produced tokens and their ids (with the unknown token appended).
    tokenizer = ClvpTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)

    expected_tokens = ["l", "o", "w", "er", "[SPACE]", "n", "e", "w", "er"]
    tokens = tokenizer.tokenize("lower newer", add_prefix_space=False)
    self.assertListEqual(tokens, expected_tokens)

    expected_ids = [0, 1, 2, 15, 21, 9, 3, 2, 15, 19]
    tokens_with_unk = tokens + [tokenizer.unk_token]
    self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens_with_unk), expected_ids)
@slow
def test_outputs_with_numbers(self):
    # Encodes a sentence containing digits with the published checkpoint and compares
    # the ids (no special tokens added) against stored reference ids.
    tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
    text = "hello and this is an example text and I have $1000. my lucky number is 12345."

    # fmt: off
    expected_ids = [
        62, 84, 28, 2, 53, 2, 147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 2, 53, 2, 22,
        2, 148, 2, 110, 2, 40, 206, 53, 2, 134, 84, 59, 32, 9, 2, 125, 2, 25, 34, 197, 38, 2, 27,
        231, 15, 44, 2, 54, 2, 33, 100, 25, 76, 2, 40, 206, 53, 7, 2, 40, 46, 18, 2, 21, 97, 17,
        219, 2, 87, 210, 8, 19, 22, 76, 9,
    ]
    # fmt: on

    actual_ids = tokenizer.encode(text, add_special_tokens=False)
    self.assertListEqual(actual_ids, expected_ids)
@slow
def test_tokenizer_integration(self):
    # Checks the batch encoding of three reference sequences with the published checkpoint.
    sequences = [
        "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
        "general-purpose architectures (BERT, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
        "Language Understanding (NLU) and Natural Language Generation (NLG) with over multiple pretrained "
        "models and deep interoperability between Jax, PyTorch and TensorFlow.",
        "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
        "conditioning on both left and right context in all layers.",
        "The quick brown fox jumps over the lazy dog.",
    ]

    # Reference ids of each sequence WITHOUT padding (222, 94 and 29 ids respectively).
    # fmt: off
    unpadded_ids = [
        [
            144, 43, 32, 87, 26, 173, 2, 5, 87, 26, 44, 70, 2, 209, 27, 2, 55, 2, 29, 38,
            51, 31, 71, 8, 144, 43, 32, 87, 26, 173, 2, 53, 2, 29, 38, 51, 31, 71, 8, 29,
            46, 144, 137, 49, 8, 15, 44, 33, 6, 2, 187, 35, 83, 61, 2, 20, 50, 44, 56, 8,
            29, 121, 139, 66, 2, 59, 71, 60, 18, 16, 33, 34, 175, 2, 5, 15, 44, 33, 7, 2,
            89, 15, 44, 33, 14, 7, 2, 37, 25, 26, 7, 2, 17, 54, 78, 25, 15, 44, 33, 7,
            2, 37, 25, 111, 33, 9, 9, 9, 6, 2, 87, 2, 27, 48, 121, 56, 2, 25, 43, 20,
            34, 14, 112, 2, 97, 234, 63, 53, 52, 2, 5, 27, 25, 34, 6, 2, 53, 2, 27, 48,
            121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 20, 50, 44, 158, 2, 5, 27, 25, 20, 6,
            2, 103, 2, 253, 2, 26, 167, 78, 29, 64, 2, 29, 46, 144, 137, 49, 2, 115, 126, 25,
            32, 2, 53, 2, 126, 18, 29, 2, 41, 114, 161, 44, 109, 151, 240, 2, 67, 33, 100, 50,
            2, 23, 14, 37, 7, 2, 29, 38, 51, 31, 71, 2, 53, 2, 33, 50, 32, 57, 19, 25,
            69, 9,
        ],
        [
            15, 44, 33, 2, 54, 2, 17, 61, 22, 20, 27, 49, 2, 51, 2, 29, 46, 8, 144, 137,
            2, 126, 18, 29, 2, 15, 83, 22, 46, 16, 181, 56, 2, 46, 29, 175, 86, 158, 32, 2,
            154, 2, 97, 25, 14, 67, 25, 49, 2, 136, 37, 33, 2, 185, 2, 23, 28, 41, 33, 70,
            2, 135, 17, 60, 107, 52, 2, 47, 2, 165, 40, 2, 64, 19, 33, 2, 53, 2, 101, 104,
            2, 135, 136, 37, 33, 2, 41, 2, 108, 2, 25, 88, 173, 9,
        ],
        [
            42, 2, 194, 91, 24, 2, 243, 190, 2, 182, 37, 2, 23, 231, 29, 32, 2, 253, 2, 42,
            2, 25, 14, 39, 38, 2, 134, 20, 9,
        ],
    ]
    # fmt: on

    # The batch is encoded with `padding=True`, so shorter rows are expected to be filled with
    # id 0 (and attention mask 0) up to the length of the longest row.
    longest = max(len(row) for row in unpadded_ids)
    expected_encoding = {
        "input_ids": [row + [0] * (longest - len(row)) for row in unpadded_ids],
        "attention_mask": [[1] * len(row) + [0] * (longest - len(row)) for row in unpadded_ids],
    }

    self.tokenizer_integration_test_util(
        sequences=sequences, expected_encoding=expected_encoding, model_name="susnato/clvp_dev", padding=True
    )