609 lines
95 KiB
Python
609 lines
95 KiB
Python
|
|
# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
|
||
|
|
#
|
||
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
# you may not use this file except in compliance with the License.
|
||
|
|
# You may obtain a copy of the License at
|
||
|
|
#
|
||
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
#
|
||
|
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
# See the License for the specific language governing permissions and
|
||
|
|
# limitations under the License.
|
||
|
|
"""Testing suite for the PyTorch LED model."""
|
||
|
|
|
||
|
|
import copy
|
||
|
|
import tempfile
|
||
|
|
import unittest
|
||
|
|
from functools import cached_property
|
||
|
|
|
||
|
|
from transformers import LEDConfig, is_torch_available
|
||
|
|
from transformers.models.auto import get_values
|
||
|
|
from transformers.testing_utils import (
|
||
|
|
require_sentencepiece,
|
||
|
|
require_tokenizers,
|
||
|
|
require_torch,
|
||
|
|
require_torch_fp16,
|
||
|
|
slow,
|
||
|
|
torch_device,
|
||
|
|
)
|
||
|
|
|
||
|
|
from ...generation.test_utils import GenerationTesterMixin
|
||
|
|
from ...test_configuration_common import ConfigTester
|
||
|
|
from ...test_modeling_common import ModelTesterMixin, ids_tensor
|
||
|
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||
|
|
|
||
|
|
|
||
|
|
if is_torch_available():
|
||
|
|
import torch
|
||
|
|
|
||
|
|
from transformers import (
|
||
|
|
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||
|
|
LEDForConditionalGeneration,
|
||
|
|
LEDForQuestionAnswering,
|
||
|
|
LEDForSequenceClassification,
|
||
|
|
LEDModel,
|
||
|
|
LEDTokenizer,
|
||
|
|
)
|
||
|
|
from transformers.models.led.modeling_led import LEDDecoder, LEDEncoder
|
||
|
|
|
||
|
|
|
||
|
|
def prepare_led_inputs_dict(
    config,
    input_ids,
    decoder_input_ids,
    attention_mask=None,
    decoder_attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
    cross_attn_head_mask=None,
):
    """Collect the keyword arguments for an LED forward pass, filling in every mask left as `None`.

    A missing attention mask is derived from the matching ids: positions equal to
    `config.pad_token_id` are masked out. A missing head mask becomes an all-ones tensor on
    `torch_device` with one row per layer and one column per attention head, sized from the
    encoder or decoder settings in `config`.

    Returns:
        `dict` with the keys `input_ids`, `decoder_input_ids`, `attention_mask`,
        `decoder_attention_mask`, `head_mask`, `decoder_head_mask` and `cross_attn_head_mask`.
    """

    def _keep_all_heads(num_layers, num_heads):
        # A mask of ones leaves every attention head enabled.
        return torch.ones(num_layers, num_heads, device=torch_device)

    # Defaults are evaluated lazily (only when the caller passed `None`), in the same order as the keys.
    return {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": input_ids.ne(config.pad_token_id) if attention_mask is None else attention_mask,
        "decoder_attention_mask": (
            decoder_input_ids.ne(config.pad_token_id) if decoder_attention_mask is None else decoder_attention_mask
        ),
        "head_mask": (
            _keep_all_heads(config.encoder_layers, config.encoder_attention_heads) if head_mask is None else head_mask
        ),
        "decoder_head_mask": (
            _keep_all_heads(config.decoder_layers, config.decoder_attention_heads)
            if decoder_head_mask is None
            else decoder_head_mask
        ),
        "cross_attn_head_mask": (
            _keep_all_heads(config.decoder_layers, config.decoder_attention_heads)
            if cross_attn_head_mask is None
            else cross_attn_head_mask
        ),
    }
|
||
|
|
|
||
|
|
|
||
|
|
class LEDModelTester:
    """Builds a tiny `LEDConfig` plus random inputs and runs the LED-specific checks used by `LEDModelTest`.

    All assertions are reported through `parent`, the test case that created this tester.
    """

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=11,
        is_training=True,
        use_labels=False,
        vocab_size=99,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=4,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=32,
        eos_token_id=2,
        pad_token_id=1,
        bos_token_id=0,
        attention_window=4,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.attention_window = attention_window

        # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size
        # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention
        # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1]
        # because its local attention only attends to `self.attention_window + 1` locations
        # (assuming no token with global attention, otherwise the last dimension of attentions
        # is x + self.attention_window + 1, where x is the number of tokens with global attention)
        # x is set to 1
        self.encoder_key_length = self.attention_window + 2

        # because of padding `encoder_seq_length`, is different from `seq_length`. Relevant for
        # the `test_attention_outputs` and `test_hidden_states_output` tests
        self.encoder_seq_length = self.seq_length

    def prepare_config_and_inputs(self):
        """Return `(config, inputs_dict)` built from random encoder/decoder ids and default masks."""
        # `clamp(3)` keeps the random encoder ids at 3 or above, i.e. clear of the default
        # bos/pad/eos ids (0/1/2). The previous duplicate `ids_tensor` call whose result was
        # immediately overwritten has been dropped.
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
        input_ids[:, -1] = self.eos_token_id  # Eos Token

        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = self.get_config()
        inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict

    def get_config(self):
        """Build the small `LEDConfig` described by this tester's attributes."""
        return LEDConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            attention_window=self.attention_window,
        )

    def get_pipeline_config(self):
        """Return the test config with `max_position_embeddings` and `vocab_size` enlarged for pipeline tests."""
        config = self.get_config()
        config.max_position_embeddings = 100
        config.vocab_size = 300
        return config

    def prepare_config_and_inputs_for_common(self):
        """Like `prepare_config_and_inputs`, plus a `global_attention_mask` that marks only the last token."""
        config, inputs_dict = self.prepare_config_and_inputs()
        global_attention_mask = torch.zeros_like(inputs_dict["input_ids"])
        global_attention_mask[:, -1] = 1
        inputs_dict["global_attention_mask"] = global_attention_mask

        return config, inputs_dict

    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
        """Check that decoding three extra tokens with cached `past_key_values` matches a full forward pass."""
        model = LEDModel(config=config).get_decoder().to(torch_device).eval()
        input_ids = inputs_dict["input_ids"]
        attention_mask = inputs_dict["attention_mask"]
        head_mask = inputs_dict["head_mask"]

        # first forward pass
        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)

        # the tuple unpacking also verifies that exactly two items are returned
        _, past_key_values = outputs.to_tuple()

        # create several hypothetical next tokens together with a random attention mask for them
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_attn_mask = ids_tensor((self.batch_size, 3), 2)

        # append them to the original ids and attention mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)

        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
            "last_hidden_state"
        ]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

        # `assertEqual` reports both lengths on failure, unlike `assertTrue(a == b)`
        self.parent.assertEqual(output_from_past_slice.shape[1], next_tokens.shape[1])

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))

    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
        """Check that the encoder and decoder, saved and reloaded on their own, reproduce the full model's outputs."""
        model = LEDModel(config=config).to(torch_device).eval()
        outputs = model(**inputs_dict)

        encoder_last_hidden_state = outputs.encoder_last_hidden_state
        last_hidden_state = outputs.last_hidden_state

        with tempfile.TemporaryDirectory() as tmpdirname:
            encoder = model.get_encoder()
            encoder.save_pretrained(tmpdirname)
            encoder = LEDEncoder.from_pretrained(tmpdirname).to(torch_device)

        encoder_last_hidden_state_2 = encoder(
            inputs_dict["input_ids"],
            attention_mask=inputs_dict["attention_mask"],
            global_attention_mask=inputs_dict["global_attention_mask"],
        )[0]

        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)

        with tempfile.TemporaryDirectory() as tmpdirname:
            decoder = model.get_decoder()
            decoder.save_pretrained(tmpdirname)
            decoder = LEDDecoder.from_pretrained(tmpdirname).to(torch_device)

        last_hidden_state_2 = decoder(
            input_ids=inputs_dict["decoder_input_ids"],
            attention_mask=inputs_dict["decoder_attention_mask"],
            encoder_hidden_states=encoder_last_hidden_state,
            encoder_attention_mask=inputs_dict["attention_mask"],
        )[0]

        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)

    def check_global_attention(self, config, inputs_dict):
        """Check that global attentions are returned and sized by the number of globally attended tokens.

        Note: `inputs_dict` is modified in place (its `attention_mask` and `global_attention_mask`
        entries are replaced).
        """
        model = LEDModel(config=config).to(torch_device).eval()
        model.config.output_attentions = True
        attention_mask = ids_tensor(inputs_dict["input_ids"].shape, vocab_size=2)
        global_attention_mask = torch.zeros_like(attention_mask)

        # set some tokens to global_attention
        num_tokens_with_global_attention = 2

        attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1
        global_attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1
        inputs_dict["attention_mask"] = attention_mask
        inputs_dict["global_attention_mask"] = global_attention_mask

        outputs = model(**inputs_dict)
        self.parent.assertIsNotNone(outputs.encoder_global_attentions)

        # Giving `num_tokens_with_global_attention` tokens global attention makes the last dim of
        # the global attentions equal to `num_tokens_with_global_attention`.
        # This used to be `assertTrue(shape, expected_shape)`, which treats the expected shape as
        # the failure message and therefore could never fail.
        # NOTE(review): the full (batch, heads, seq, x) shape is intentionally not asserted here;
        # the sequence dim may still include attention-window padding - TODO confirm before tightening.
        self.parent.assertEqual(
            outputs.encoder_global_attentions[0].shape[-1],
            num_tokens_with_global_attention,
        )
|
||
|
|
|
||
|
|
|
||
|
|
@require_torch
class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    # Common model, generation and pipeline tests for LED, with overrides where LED's
    # local/global encoder attention produces shapes or inputs the shared tests do not expect.

    all_model_classes = (
        (LEDModel, LEDForConditionalGeneration, LEDForSequenceClassification, LEDForQuestionAnswering)
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = (
        {
            "feature-extraction": LEDModel,
            "question-answering": LEDForQuestionAnswering,
            "summarization": LEDForConditionalGeneration,
            "text-classification": LEDForSequenceClassification,
            "text2text-generation": LEDForConditionalGeneration,
            "translation": LEDForConditionalGeneration,
            "zero-shot": LEDForSequenceClassification,
        }
        if is_torch_available()
        else {}
    )
    is_encoder_decoder = True
    test_pruning = False
    test_missing_keys = False
    test_torchscript = False

    # TODO: Fix the failed tests when this model gets more usage
    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
        config_class,
        model_architecture,
        tokenizer_name,
        image_processor_name,
        feature_extractor_name,
        processor_name,
    ):
        """Return `True` for the QA pipeline tests unless they run with a fast (`*Fast`) tokenizer."""
        # `tokenizer_name` may be `None`; treat that like "not a fast tokenizer" instead of
        # raising `AttributeError` on `None.endswith`.
        uses_fast_tokenizer = tokenizer_name is not None and tokenizer_name.endswith("Fast")
        return pipeline_test_case_name == "QAPipelineTests" and not uses_fast_tokenizer

    def setUp(self):
        self.model_tester = LEDModelTester(self)
        self.config_tester = ConfigTester(self, config_class=LEDConfig)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_save_load_strict(self):
        # A save/load round trip must not report any missing keys for any model class.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
        for model_class in self.all_model_classes:
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                _, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
            self.assertEqual(info["missing_keys"], [])

    def test_decoder_model_past_with_large_inputs(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)

    def test_encoder_decoder_model_standalone(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)

    def test_global_attention(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.check_global_attention(*config_and_inputs)

    def prepare_config_and_inputs_for_generate(self, *args, **kwargs):
        """Return the generation inputs of the parent mixin without the `global_attention_mask` entry."""
        config, inputs_dict = super().prepare_config_and_inputs_for_generate(*args, **kwargs)
        # LED computes attention scores based on mask indices if `is_global`
        inputs_dict.pop("global_attention_mask")
        return config, inputs_dict

    # LEDForSequenceClassification does not support inputs_embeds
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in (LEDModel, LEDForConditionalGeneration, LEDForQuestionAnswering):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

            # swap the token ids for their embeddings, for the encoder and (if present) the decoder
            if not self.is_encoder_decoder:
                input_ids = inputs["input_ids"]
                del inputs["input_ids"]
            else:
                encoder_input_ids = inputs["input_ids"]
                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
                del inputs["input_ids"]
                inputs.pop("decoder_input_ids", None)

            wte = model.get_input_embeddings()
            if not self.is_encoder_decoder:
                inputs["inputs_embeds"] = wte(input_ids)
            else:
                inputs["inputs_embeds"] = wte(encoder_input_ids)
                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)

            # only checks that the forward pass runs with embeddings as input
            with torch.no_grad():
                model(**inputs)[0]

    @require_torch_fp16
    def test_generate_fp16(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs()
        input_ids = input_dict["input_ids"]
        # mask padding via the configured pad id instead of a hard-coded `1`
        attention_mask = input_ids.ne(config.pad_token_id).to(torch_device)
        model = LEDForConditionalGeneration(config).eval().to(torch_device)
        model.half()
        model.generate(input_ids, attention_mask=attention_mask)
        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)

    @unittest.skip(reason="Longformer cannot keep gradients in attentions or hidden states")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        seq_length = self.model_tester.seq_length
        encoder_seq_length = self.model_tester.encoder_seq_length
        encoder_key_length = self.model_tester.encoder_key_length

        for model_class in self.all_model_classes:
            # first request the attentions through the forward kwargs
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class._from_config(config, attn_implementation="eager")
            config = model.config
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.encoder_attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.encoder_attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
            )
            out_len = len(outputs)

            # global attention outputs are added as well => so +1 here
            correct_outlen = 6

            # loss is at first position
            if "labels" in inputs_dict:
                correct_outlen += 1  # loss is added to beginning
            # Question Answering model returns start_logits and end_logits
            if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                correct_outlen += 1  # start_logits and end_logits instead of only 1 output
            if "past_key_values" in outputs:
                correct_outlen += 1  # past_key_values have been returned

            self.assertEqual(out_len, correct_outlen)

            # decoder attentions
            decoder_attentions = outputs.decoder_attentions
            self.assertIsInstance(decoder_attentions, (list, tuple))
            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(decoder_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, seq_length, seq_length],
            )

            # cross attentions
            cross_attentions = outputs.cross_attentions
            self.assertIsInstance(cross_attentions, (list, tuple))
            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(cross_attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads,
                    seq_length,
                    seq_length,
                ],
            )

    def _check_encoder_attention_for_generate(self, attentions, batch_size, config, prompt_length):
        """Check that every encoder attention returned by `generate` has LED's windowed shape."""
        # overwrite because LED does not have (bs, num_heads, seq_len, seq_len) shape
        encoder_expected_shape = (
            batch_size,
            config.num_attention_heads,
            prompt_length,
            self.model_tester.attention_window // 2 * 2 + 1,
        )
        self.assertIsInstance(attentions, tuple)
        self.assertListEqual(
            [layer_attentions.shape for layer_attentions in attentions],
            [encoder_expected_shape] * len(attentions),
        )
|
||
|
|
|
||
|
|
|
||
|
|
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
    """Return `True` if `a` and `b` are close, otherwise raise an `AssertionError` describing the mismatch.

    Args:
        a, b: The tensors to compare. Two `None` values count as equal.
        atol: Absolute tolerance forwarded to `torch.allclose` (its default relative tolerance still applies).
        prefix: Optional label; when non-empty the error message starts with `"<prefix>: "`.

    Returns:
        `True` when both arguments are `None` or `torch.allclose` accepts them.

    Raises:
        AssertionError: If `a` and `b` are not both tensors, if they cannot be compared
            (for example shapes that do not broadcast), or if their values differ.
    """
    if a is None and b is None:
        return True

    def _fail(detail, cause=None):
        # Single exit point so every failure carries the optional prefix.
        raise AssertionError(f"{prefix}: {detail}" if prefix else detail) from cause

    # Previously a `None`/non-tensor argument escaped as a raw `TypeError`/`AttributeError`.
    if not (isinstance(a, torch.Tensor) and isinstance(b, torch.Tensor)):
        _fail(f"expected two tensors, got {type(a).__name__} and {type(b).__name__}")

    try:
        if torch.allclose(a, b, atol=atol):
            return True
    except RuntimeError as err:
        # e.g. shapes that cannot be broadcast; report it as an assertion failure instead of
        # crashing again while building the message.
        _fail(f"cannot compare tensors of shape {tuple(a.shape)} and {tuple(b.shape)}: {err}", cause=err)

    if a.numel() > 100:
        # Large tensors: summarise instead of printing every value.
        pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
        _fail(f"tensor values are {pct_different:.1%} percent different.")
    _fail(f"{a} != {b}")
|
||
|
|
|
||
|
|
|
||
|
|
def _long_tensor(tok_lst):
    """Build a `torch.long` tensor on `torch_device` from `tok_lst`."""
    tensor_kwargs = {"dtype": torch.long, "device": torch_device}
    return torch.tensor(tok_lst, **tensor_kwargs)
|
||
|
|
|
||
|
|
|
||
|
|
# Shared `rtol`/`atol` passed to `torch.testing.assert_close` by the integration tests.
TOLERANCE = 0.0001
|
||
|
|
|
||
|
|
|
||
|
|
@require_torch
|
||
|
|
@require_sentencepiece
|
||
|
|
@require_tokenizers
|
||
|
|
@slow
|
||
|
|
class LEDModelIntegrationTests(unittest.TestCase):
|
||
|
|
"""All the below results were obtained with the original checkpoints and code
|
||
|
|
base from https://github.com/allenai/longformer.
|
||
|
|
IMPORTANT: Note that the original checkpoints include a `position_embeddings` "hack"
|
||
|
|
and have to be cut to have the correct shape.
|
||
|
|
See: https://github.com/huggingface/transformers/pull/9278#issue-544709661.
|
||
|
|
"""
|
||
|
|
|
||
|
|
@cached_property
def default_tokenizer(self):
    """Load the `allenai/led-base-16384` tokenizer; `cached_property` keeps it for later accesses."""
    checkpoint = "allenai/led-base-16384"
    return LEDTokenizer.from_pretrained(checkpoint)
|
||
|
|
|
||
|
|
def test_inference_no_head(self):
    # Compares the leading 3x3 slice of `LEDModel`'s last hidden state with stored reference values.
    model = LEDModel.from_pretrained("allenai/led-base-16384").to(torch_device)

    # The same 8-token pattern is repeated 512 times for the encoder and 128 times for the decoder.
    token_pattern = [0, 31414, 232, 328, 740, 1140, 12695, 69]
    encoder_ids = _long_tensor([512 * token_pattern])
    decoder_ids = _long_tensor([128 * token_pattern])
    model_inputs = prepare_led_inputs_dict(model.config, encoder_ids, decoder_ids)

    with torch.no_grad():
        hidden = model(**model_inputs).last_hidden_state

    self.assertEqual(hidden.shape, torch.Size((1, 1024, 768)))

    reference = torch.tensor(
        [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]],
        device=torch_device,
    )
    torch.testing.assert_close(hidden[:, :3, :3], reference, rtol=TOLERANCE, atol=TOLERANCE)
|
||
|
|
|
||
|
|
def test_inference_head(self):
    # Compares the leading 3x3 slice of `LEDForConditionalGeneration`'s logits with stored reference values.
    model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").to(torch_device)

    # The same 8-token pattern is repeated 512 times for the encoder and 128 times for the decoder.
    token_pattern = [0, 31414, 232, 328, 740, 1140, 12695, 69]
    encoder_ids = _long_tensor([512 * token_pattern])
    decoder_ids = _long_tensor([128 * token_pattern])
    model_inputs = prepare_led_inputs_dict(model.config, encoder_ids, decoder_ids)

    with torch.no_grad():
        logits = model(**model_inputs, use_cache=False).logits

    self.assertEqual(logits.shape, torch.Size((1, 1024, model.config.vocab_size)))

    reference = torch.tensor(
        [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]],
        device=torch_device,
    )
    torch.testing.assert_close(logits[:, :3, :3], reference, rtol=TOLERANCE, atol=TOLERANCE)
|
||
|
|
|
||
|
|
def test_seq_to_seq_generation(self):
|
||
|
|
# this test requires 16GB of RAM
|
||
|
|
hf = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv").to(torch_device)
|
||
|
|
tok = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
|
||
|
|
|
||
|
|
ARTICLE_LEP = r"""the lep experiments at the resonance of @xmath1-boson have tested the standard model ( sm ) at quantum level , measuring the @xmath1-decay into fermion pairs with an accuracy of one part in ten thousands . the good agreement of the lep data with the sm predictions have severely constrained the behavior of new physics at the @xmath1-pole . taking these achievements into account one can imagine that the physics of @xmath1-boson will again play the central role in the frontier of particle physics if the next generation @xmath1 factory comes true with the generated @xmath1 events several orders of magnitude higher than that of the lep . this factory can be realized in the gigaz option of the international linear collider ( ilc)@xcite . the ilc is a proposed electron - positron collider with tunable energy ranging from @xmath12 to @xmath13 and polarized beams in its first phase , and the gigaz option corresponds to its operation on top of the resonance of @xmath1 boson by adding a bypass to its main beam line . given the high luminosity , @xmath14 , and the cross section at the resonance of @xmath1 boson , @xmath15 , about @xmath16 @xmath1 events can be generated in an operational year of @xmath17 of gigaz , which implies that the expected sensitivity to the branching ratio of @xmath1-decay can be improved from @xmath18 at the lep to @xmath19 at the gigaz@xcite . in light of this , the @xmath1-boson properties , especially its exotic or rare decays which are widely believed to be sensitive to new physics , should be investigated comprehensively to evaluate their potential in probing new physics . 
among the rare @xmath1-decays , the flavor changing ( fc ) processes were most extensively studied to explore the flavor texture in new physics @xcite , and it was found that , although these processes are severely suppressed in the sm , their branching ratios in new physics models can be greatly enhanced to @xmath19 for lepton flavor violation decays @xcite and @xmath20 for quark flavor violation decays @xcite . besides the fc processes , the @xmath1-decay into light higgs boson(s ) is another type of rare process that was widely studied , e.g. the decay @xmath21 ( @xmath22 ) with the particle @xmath0 denoting a light higgs boson was studied in @xcite , the decay @xmath23 was studied in the two higgs doublet model ( 2hdm)@xcite and the minimal supersymmetric standard model ( mssm)@xcite , and the decay @xmath4 was studied in a model independent way @xcite , in 2hdm@xcite and also in mssm@xcite . these studies indicate that , in contrast with the kinematic forbidden of these decays in the sm , the rates of these decays can be as large as @xmath18 in new physics models , which lie within the expected sensitivity of the gigaz . in this work , we extend the previous studies of these decays to some new models and investigate these decays altogether . we are motivated by some recent studies on the singlet extension of the mssm , such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly minimal supersymmetric standard model ( nmssm ) @xcite , where a light cp - odd higgs boson @xmath0 with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry like @xmath24 or peccei - quuin symmetry @xcite . these non - minimal supersymmetric models can not only avoid the @xmath25-problem , but also alleviate the little hierarchy by having such a light higgs boson @xmath0 @xcite . 
we are also motivated by that , with the latest experiments , the properties of the light higgs boson are more stringently constrained than before . so it is worth updating the previous studies . so far there is no model - independent lower bound on the lightest higgs boson mass . in the sm , it must be heavier than @xmath26 gev , obtained from the null observation of the higgs boson at lep experiments . however , due to the more complex structure of the higgs sector in the extensions of the sm , this lower bound can be significantly relaxed according to
|
||
|
|
|
||
|
|
ARTICLE_MAGNET = r"""it is well known that the classical magnetoresistance ( mr ) in metals or semiconductors with a closed free electron fermi surface increases quadratically with increasing magnetic field @xmath2 for @xmath3 and saturates when @xmath4 . here @xmath5 is the zero - magnetic - field mobility . hence , the extraordinarily high and linear mr ( lmr ) , which breaks this familiar rule , has been gaining much attention as soon as its discovery . in the past decade , this unexpected lmr has been reported in silver chalcogenide,@xcite indium antimonide,@xcite silicon,@xcite mnas - gaas composite material,@xcite and graphene.@xcite kapitza s linear law@xcite indicates that the metal shows a magnetoresistance linear in perpendicular magnetic field when it has an open fermi surface and a mean free path longer than the electronic larmor radius . recently , another two models , irrespective of the open fermi surface , have been constructed to provide possible mechanisms for the lmr phenomenon . abrikosov suggested a quantum - limit origin of lmr for the homogenous system with a gapless linear energy spectrum.@xcite his model requires that landau levels are well formed and the carrier concentration is small that all electrons occupy only the lowest landau band . alternatively , parish and littlewood developed a classical model without involving linear spectrum.@xcite ignoring the concrete microscopic mechanism , they attributed this unusual mr to the mobility fluctuations in a strongly inhomogenous system . topological insulators@xcite ( tis ) are novel materials with a full energy gap in bulk , while there are gapless surface states . due to its unique band structure with only one helical dirac cone and linear energy dispersion,@xcite the surface states of the ti bi@xmath0se@xmath1 become an excellent platform for the study of quantum - limit lmr . 
the recent experiment in this flat surface system , however , reported that a large positive mr , which becomes very linear above a characteristic field of @xmath6@xmath7@xmath8 t , was observed even in an opposite situation where the carrier sheet density is high that electrons occupy more than one landau levels.@xcite moreover , they found that raising temperature to room temperature almost has no influence on the observed lmr . it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model . so far a reliable theoretical scheme capable of explaining this novel experiment has still been lacking . in this paper , we generalize the balance - equation approach@xcite to a system modeling the surface states of a three - dimensional ti to investigate the two - dimensional magnetotransport in it . we find that a positive , nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the ti surface state having a positive and finite effective g - factor . this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels , and persists up to room temperature , providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite we consider the surface state of a bi@xmath0se@xmath1-type large bulk gap ti in the @xmath9-@xmath10 plane under the influence of a uniform magnetic field @xmath11 applied along the @xmath12 direction.@xcite following the experimental observation,@xcite we assume that the fermi energy locates in the gap of the bulk band and above the dirac point , i.e. the surface carriers are electrons . 
further , the separations of the fermi energy from the bottom of bulk band and dirac point are much larger than the highest temperature ( @xmath13 ) considered in this work . hence , the contribution from the bulk band to the magnetotransport is negligible . these electrons , scattered by randomly distributed impurities and by phonons , are driven by a unifo
|
||
|
|
|
||
|
|
dct = tok.batch_encode_plus(
|
||
|
|
[ARTICLE_LEP, ARTICLE_MAGNET],
|
||
|
|
max_length=6144,
|
||
|
|
padding="max_length",
|
||
|
|
truncation=True,
|
||
|
|
return_tensors="pt",
|
||
|
|
)
|
||
|
|
|
||
|
|
hypotheses_batch = hf.generate(
|
||
|
|
input_ids=dct["input_ids"].to(torch_device),
|
||
|
|
attention_mask=dct["attention_mask"].to(torch_device),
|
||
|
|
num_beams=4,
|
||
|
|
max_length=512,
|
||
|
|
early_stopping=True,
|
||
|
|
no_repeat_ngram_size=3,
|
||
|
|
)
|
||
|
|
|
||
|
|
EXPECTED_LEP = (
|
||
|
|
" the physics of @xmath0-boson will again play the central role in the frontier of particle physics if the"
|
||
|
|
" gigaz option of the international linear collider ( ilc ) can be realized in its first phase. \n the"
|
||
|
|
" expected sensitivity to the branching ratio of rare decays, especially its exotic or rare processes,"
|
||
|
|
" should be investigated comprehensively to evaluate their potential in probing new physics. in this work"
|
||
|
|
" \n, we study the rare decay into light higgs boson(s ) in the framework of the minimal supersymmetric"
|
||
|
|
" standard model ( mssm ), where a light cp - odd higgs - boson with singlet - dominant component may"
|
||
|
|
" naturally arise from the spontaneous breaking of some approximate global symmetry. "
|
||
|
|
)
|
||
|
|
|
||
|
|
EXPECTED_MAGNET = (
|
||
|
|
" the recent experiment in the surface states of the topological insulator bi@xmath0se @xmath1, however,"
|
||
|
|
" reported that a large positive magnetoresistance becomes very linear in perpendicular magnetic field"
|
||
|
|
" even in an opposite situation where the carrier sheet density is high that all electrons occupy more"
|
||
|
|
" than one landau levels. \n it is striking that this observation is in conflict with abrikosov s model"
|
||
|
|
" and also with the classical parish - littlewood model. "
|
||
|
|
)
|
||
|
|
|
||
|
|
generated = tok.batch_decode(
|
||
|
|
hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
|
||
|
|
)
|
||
|
|
assert generated == [EXPECTED_LEP, EXPECTED_MAGNET]
|