# File: enginex-mlu370-any2any/transformers/tests/models/dia/test_processing_dia.py
# Listing metadata: 2025-10-09 16:47:16 +08:00, 261 lines, 11 KiB, Python

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
import numpy as np
from parameterized import parameterized
from transformers import DacModel, DiaFeatureExtractor, DiaProcessor, DiaTokenizer
from transformers.testing_utils import require_torch
from transformers.utils import is_torch_available
# `is_torch_available` is a function and has to be called: the bare function object is
# always truthy, which would make this guard unconditional.
if is_torch_available():
    import torch
# Adapted from tests.utils.test_modeling_utils.check_models_equal
def check_models_equal(model1, model2):
    """Return whether two models hold the same parameters.

    Parameters are compared pairwise, in the order yielded by ``parameters()``.

    Args:
        model1: First model; must expose ``parameters()``.
        model2: Second model; must expose ``parameters()``.

    Returns:
        `True` if both models have the same number of parameters and every pair matches
        in shape and values, `False` otherwise.
    """
    params1 = list(model1.parameters())
    params2 = list(model2.parameters())
    # A bare `zip` stops at the shorter model, so a model would be reported as equal to any
    # other model that merely starts with the same parameters.
    if len(params1) != len(params2):
        return False
    return all(torch.equal(p1.data, p2.data) for p1, p2 in zip(params1, params2))
@require_torch
class DiaProcessorTest(unittest.TestCase):
    """Tests for `DiaProcessor`.

    Covers save/load round trips, text tokenization, audio encoding and decoding through the
    audio tokenizer, and the static helpers that apply and revert the per-channel delay.
    """

    def setUp(self):
        """Create a temporary directory and build the processor shared by the tests."""
        self.checkpoint = "AntonV/Dia-1.6B"
        self.audio_tokenizer_checkpoint = "descript/dac_44khz"

        self.tmpdirname = tempfile.mkdtemp()
        # Registered immediately: `tearDown` is not run when `setUp` itself raises (for example
        # if loading one of the checkpoints below fails), which would leak the directory.
        self.addCleanup(shutil.rmtree, self.tmpdirname)

        # Audio tokenizer is a bigger model so we will reuse this if possible
        self.processor = DiaProcessor(
            tokenizer=self.get_tokenizer(),
            feature_extractor=self.get_feature_extractor(),
            audio_tokenizer=self.get_audio_tokenizer(),
        )

        # Default audio values based on Dia and Dac
        self.pad_id = 1025
        self.bos_id = 1026
        self.dac_chunk_len = 512
        self.delay_pattern = [0, 8, 9, 10, 11, 12, 13, 14, 15]

    def get_tokenizer(self, **kwargs):
        """Load a `DiaTokenizer` from `self.checkpoint`, forwarding `kwargs` to `from_pretrained`."""
        return DiaTokenizer.from_pretrained(self.checkpoint, **kwargs)

    def get_feature_extractor(self, **kwargs):
        """Load a `DiaFeatureExtractor` from `self.checkpoint`, forwarding `kwargs` to `from_pretrained`."""
        return DiaFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)

    def get_audio_tokenizer(self, **kwargs):
        """Load a `DacModel` from `self.audio_tokenizer_checkpoint`, forwarding `kwargs` to `from_pretrained`."""
        return DacModel.from_pretrained(self.audio_tokenizer_checkpoint, **kwargs)

    def tearDown(self):
        """Drop the processor; the temporary directory is removed by the cleanup registered in `setUp`."""
        del self.processor

    def test_save_load_pretrained_default(self):
        """A processor saved with `save_pretrained` is restored with equivalent components."""
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
        audio_tokenizer = self.get_audio_tokenizer()

        processor = DiaProcessor(
            tokenizer=tokenizer, feature_extractor=feature_extractor, audio_tokenizer=audio_tokenizer
        )
        processor.save_pretrained(self.tmpdirname)
        processor = DiaProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertIsInstance(processor.tokenizer, DiaTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(processor.feature_extractor, DiaFeatureExtractor)

        self.assertEqual(processor.audio_tokenizer.__class__.__name__, audio_tokenizer.__class__.__name__)
        self.assertEqual(processor.audio_tokenizer.name_or_path, audio_tokenizer.name_or_path)
        self.assertTrue(check_models_equal(processor.audio_tokenizer, audio_tokenizer))
        self.assertIsInstance(processor.audio_tokenizer, DacModel)

    def test_save_load_pretrained_additional_features(self):
        """A reloaded processor matches freshly loaded reference components."""
        # NOTE(review): no additional kwargs are passed to any component here, so this currently
        # exercises the same round trip as `test_save_load_pretrained_default`.
        processor = DiaProcessor(
            tokenizer=self.get_tokenizer(),
            feature_extractor=self.get_feature_extractor(),
            audio_tokenizer=self.get_audio_tokenizer(),
        )
        processor.save_pretrained(self.tmpdirname)

        tokenizer_add_kwargs = self.get_tokenizer()
        feature_extractor_add_kwargs = self.get_feature_extractor()
        audio_tokenizer_add_kwargs = self.get_audio_tokenizer()

        processor = DiaProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, DiaTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor, DiaFeatureExtractor)

        self.assertEqual(processor.audio_tokenizer.__class__.__name__, audio_tokenizer_add_kwargs.__class__.__name__)
        self.assertEqual(processor.audio_tokenizer.name_or_path, audio_tokenizer_add_kwargs.name_or_path)
        self.assertTrue(check_models_equal(processor.audio_tokenizer, audio_tokenizer_add_kwargs))
        self.assertIsInstance(processor.audio_tokenizer, DacModel)

    def test_tokenize(self):
        """Text-only processor output matches the tokenizer called with padding and `pt` tensors."""
        tokenizer = self.get_tokenizer()
        random_text = ["This is a processing test for tokenization", "[S1] Dia template style [S2] Nice"]

        input_tokenizer = tokenizer(random_text, padding=True, return_tensors="pt")
        input_processor = self.processor(random_text)

        for key in input_tokenizer:
            self.assertTrue((input_tokenizer[key] == input_processor[key]).all())

    def test_no_audio(self):
        """Without audio, each channel starts with `delay + 1` bos tokens and is padded afterwards."""
        random_text = ["Dummy Input"] * 2
        input_processor = self.processor(random_text)
        audio_tokens, audio_mask = input_processor["decoder_input_ids"], input_processor["decoder_attention_mask"]

        expected_len = max(self.delay_pattern) + 1

        # full mask with +1 for bos
        self.assertEqual(audio_mask.sum().item(), expected_len * len(random_text))
        self.assertEqual(
            tuple(audio_tokens.shape),
            (len(random_text), expected_len, len(self.delay_pattern)),
        )

        for channel_idx, delay in enumerate(self.delay_pattern):
            expected_sequence = torch.ones(size=(audio_tokens.shape[:-1])) * self.pad_id
            expected_sequence[:, : delay + 1] = self.bos_id
            self.assertTrue((audio_tokens[..., channel_idx] == expected_sequence).all())

    def test_audio(self):
        """Encoded audio matches the audio tokenizer codebooks once bos, delay and padding are applied."""
        audio_tokenizer = self.get_audio_tokenizer()
        feature_extractor = self.get_feature_extractor()

        random_text = ["Dummy Input"] * 2
        # Dac only starts accepting audio from a certain length (ensured via >=1024)
        raw_speeches = [np.random.rand(2048).astype(np.float32), np.random.rand(1024).astype(np.float32)]
        input_processor = self.processor(random_text, raw_speeches)
        audio_tokens, audio_mask = input_processor["decoder_input_ids"], input_processor["decoder_attention_mask"]

        sequence_len = audio_mask.shape[1]
        for batch_idx, speech in enumerate(raw_speeches):
            raw_audio = feature_extractor(speech, return_tensors="pt")["input_values"]
            codebooks = audio_tokenizer(raw_audio).audio_codes.transpose(1, 2)

            pad_len = sequence_len - audio_mask.sum(dim=-1)[batch_idx]
            for channel_idx, delay in enumerate(self.delay_pattern):
                # Left padding filled bos, right padding (delay) are pad
                start_idx = pad_len + delay + 1
                end_idx = start_idx + codebooks.shape[1]

                encoded_sequence = audio_tokens[batch_idx, :, channel_idx]
                expected_sequence = torch.ones(size=(sequence_len,)) * self.pad_id
                expected_sequence[:start_idx] = self.bos_id
                expected_sequence[start_idx:end_idx] = codebooks[0, :, channel_idx]

                self.assertTrue((encoded_sequence == expected_sequence).all())

        # Just to make sure the masking correctly only ignores bos tokens
        self.assertTrue((audio_tokens[~audio_mask.bool()] == self.bos_id).all())

    @parameterized.expand([([1, 1],), ([1, 5],), ([2, 4, 6],)])
    def test_decode_audio(self, audio_lens):
        """`batch_decode` reproduces the audio obtained by decoding the audio tokenizer codebooks directly.

        Args:
            audio_lens: Length of each sample, expressed in multiples of `self.dac_chunk_len`.
        """
        feature_extractor = self.get_feature_extractor()
        audio_tokenizer = self.get_audio_tokenizer()

        random_text = ["Dummy Input"] * len(audio_lens)
        raw_speeches = [
            np.random.rand(self.dac_chunk_len * num_chunks).astype(np.float32) for num_chunks in audio_lens
        ]
        # we need eos (given if training) to decode properly, also enforced via custom logits processor
        input_processor = self.processor(random_text, raw_speeches, generation=False)
        audio_tokens = input_processor["decoder_input_ids"]

        decoded_speeches = self.processor.batch_decode(audio_tokens)
        for batch_idx, speech in enumerate(raw_speeches):
            raw_audio = feature_extractor(speech, return_tensors="pt")["input_values"]
            codebooks = audio_tokenizer(raw_audio).audio_codes

            decoded_audio = decoded_speeches[batch_idx]
            expected_audio = audio_tokenizer.decode(audio_codes=codebooks).audio_values

            self.assertTrue((expected_audio == decoded_audio).all())
            self.assertEqual(decoded_audio.shape[-1], audio_lens[batch_idx] * self.dac_chunk_len)

    @parameterized.expand([(1, 2, [0, 1, 4]), (2, 4, [1, 3, 2]), (4, 8, [0, 5, 7])])
    def test_delay_in_audio(self, bsz, seq_len, delay_pattern):
        """Applying the delay shifts every channel by its delay, and reverting restores the input.

        Args:
            bsz: Batch size of the synthetic audio input.
            seq_len: Sequence length of the synthetic audio input before the delay padding.
            delay_pattern: Per-channel delay; its length defines the number of channels.
        """
        # static functions which are crucial, hence we also test them here
        build_indices_fn = DiaProcessor.build_indices
        delay_fn = DiaProcessor.apply_audio_delay

        bos, pad = -2, -1
        num_channels = len(delay_pattern)
        max_delay = max(delay_pattern)

        audio_input = torch.arange(bsz * seq_len * num_channels).view(bsz, seq_len, num_channels)
        # imitate a delay mask with zeroes
        audio_input = torch.cat([audio_input, torch.zeros(size=(bsz, max_delay, num_channels))], dim=1)

        precomputed_idx = build_indices_fn(
            bsz=bsz,
            seq_len=seq_len + max_delay,
            num_channels=num_channels,
            delay_pattern=delay_pattern,
            revert=False,
        )
        delayed_audio_out = delay_fn(
            audio=audio_input,
            pad_token_id=pad,
            bos_token_id=bos,
            precomputed_idx=precomputed_idx,
        )

        # every channel idx is shifted by delay_pattern[idx]
        delayed_audio_res = audio_input.clone()
        for idx, delay in enumerate(delay_pattern):
            delayed_audio_res[:, :delay, idx] = bos
            remaining_input = seq_len + max_delay - delay
            delayed_audio_res[:, delay:, idx] = audio_input[:, :remaining_input, idx]

        self.assertTrue((delayed_audio_out == delayed_audio_res).all())

        # we should get back to the original audio we had (when removing the delay pad)
        bsz, new_seq_len, num_channels = delayed_audio_out.shape
        precomputed_idx = build_indices_fn(
            bsz=bsz,
            seq_len=new_seq_len,
            num_channels=num_channels,
            delay_pattern=delay_pattern,
            revert=True,
        )
        reverted_audio_out = delay_fn(
            audio=delayed_audio_out,
            pad_token_id=pad,
            bos_token_id=bos,
            precomputed_idx=precomputed_idx,
        )

        reverted_audio_res = audio_input.clone()[:, :seq_len]

        self.assertTrue((reverted_audio_out[:, :seq_len] == reverted_audio_res).all())