init
This commit is contained in:
0
transformers/tests/models/deepseek_vl/__init__.py
Normal file
0
transformers/tests/models/deepseek_vl/__init__.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import DeepseekVLImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import DeepseekVLImageProcessorFast
|
||||
|
||||
|
||||
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL
|
||||
class DeepseekVLImageProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=7,
|
||||
num_channels=3,
|
||||
image_size=18,
|
||||
min_resolution=30,
|
||||
max_resolution=400,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=[0.5, 0.5, 0.5],
|
||||
image_std=[0.5, 0.5, 0.5],
|
||||
):
|
||||
size = size if size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
|
||||
def prepare_image_processor_dict(self):
|
||||
return {
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_normalize": self.do_normalize,
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
}
|
||||
|
||||
# Ignore copy
|
||||
def expected_output_image_shape(self, images):
|
||||
max_size = max(self.size["height"], self.size["width"])
|
||||
return self.num_channels, max_size, max_size
|
||||
|
||||
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
return prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None
|
||||
fast_image_processing_class = DeepseekVLImageProcessorFast if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = DeepseekVLImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
def image_processor_dict(self):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
|
||||
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
def test_slow_fast_equivalence_batched(self):
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||
|
||||
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
|
||||
|
||||
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
|
||||
self.skipTest(
|
||||
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
|
||||
)
|
||||
|
||||
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
||||
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
|
||||
encoding_fast = image_processor_fast(dummy_images, return_tensors=None)
|
||||
|
||||
# Overwrite as the outputs are not always all of the same shape (kept for BC)
|
||||
for i in range(len(encoding_slow.pixel_values)):
|
||||
self._assert_slow_fast_tensors_equivalence(
|
||||
torch.from_numpy(encoding_slow.pixel_values[i]), encoding_fast.pixel_values[i]
|
||||
)
|
||||
|
||||
# Ignore copy
|
||||
@unittest.skip(reason="Not supported")
|
||||
def test_call_numpy_4_channels(self):
|
||||
pass
|
||||
@@ -0,0 +1,355 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch DeepseekVL model."""
|
||||
|
||||
import re
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
DeepseekVLConfig,
|
||||
DeepseekVLForConditionalGeneration,
|
||||
DeepseekVLModel,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class DeepseekVLModelTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=2,
|
||||
seq_length=25,
|
||||
num_channels=3,
|
||||
initializer_range=0.02,
|
||||
is_training=True,
|
||||
use_cache=False,
|
||||
text_config={
|
||||
"num_hidden_layers": 2,
|
||||
"vocab_size": 99,
|
||||
"hidden_size": 16,
|
||||
"intermediate_size": 37,
|
||||
"max_position_embeddings": 512,
|
||||
"num_attention_heads": 4,
|
||||
"pad_token_id": 1,
|
||||
},
|
||||
vision_config={
|
||||
"num_hidden_layers": 1,
|
||||
"hidden_size": 16,
|
||||
"intermediate_size": 37,
|
||||
"image_size": 32,
|
||||
"patch_size": 8,
|
||||
"hidden_act": "gelu",
|
||||
"vision_use_head": False,
|
||||
"num_attention_heads": 4,
|
||||
},
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.num_channels = num_channels
|
||||
self.initializer_range = initializer_range
|
||||
self.is_training = is_training
|
||||
self.use_cache = use_cache
|
||||
|
||||
self.text_config = text_config
|
||||
self.vision_config = vision_config
|
||||
self.vision_config["num_channels"] = self.num_channels
|
||||
|
||||
self.num_hidden_layers = text_config["num_hidden_layers"]
|
||||
self.vocab_size = text_config["vocab_size"]
|
||||
self.hidden_size = text_config["hidden_size"]
|
||||
self.num_attention_heads = text_config["num_attention_heads"]
|
||||
self.image_size = vision_config["image_size"]
|
||||
self.num_image_tokens = 16
|
||||
self.pad_token_id = text_config["pad_token_id"]
|
||||
self.image_token_id = 0
|
||||
|
||||
def get_config(self):
|
||||
return DeepseekVLConfig(
|
||||
text_config=self.text_config,
|
||||
vision_config=self.vision_config,
|
||||
image_token_id=self.image_token_id,
|
||||
)
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
config = self.get_config()
|
||||
|
||||
# create text and vision inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
|
||||
attention_mask = random_attention_mask([self.batch_size, self.seq_length])
|
||||
pixel_values = floats_tensor(
|
||||
[
|
||||
self.batch_size,
|
||||
self.num_channels,
|
||||
self.image_size,
|
||||
self.image_size,
|
||||
]
|
||||
)
|
||||
# fill image_tokens
|
||||
input_ids[input_ids == self.num_image_tokens] = config.text_config.pad_token_id
|
||||
input_ids[:, : self.num_image_tokens] = self.image_token_id
|
||||
|
||||
return config, input_ids, attention_mask, pixel_values
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
config, input_ids, attention_mask, pixel_values = config_and_inputs
|
||||
inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
|
||||
return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
|
||||
class DeepseekVLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (DeepseekVLModel, DeepseekVLForConditionalGeneration) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
"feature-extraction": DeepseekVLModel,
|
||||
"image-text-to-text": DeepseekVLForConditionalGeneration,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
_is_composite = True
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = DeepseekVLModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=DeepseekVLConfig, has_text_modality=False)
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||
def test_inputs_embeds(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
inputs["inputs_embeds"] = wte(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
model(**inputs)
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs.
|
||||
def test_inputs_embeds_matches_input_ids(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
|
||||
inputs_embeds = model.get_input_embeddings()(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
out_ids = model(input_ids=input_ids, **inputs)[0]
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
@unittest.skip(reason="Siglip uses a non-standard initialization scheme")
|
||||
# Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
|
||||
def test_initialization(self):
|
||||
pass
|
||||
|
||||
# Copied from tests.models.janus.test_modeling_janus.JanusVisionText2TextModelTest.test_sdpa_can_dispatch_composite_models
|
||||
def test_sdpa_can_dispatch_composite_models(self):
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
|
||||
# Load the model with SDPA
|
||||
model_sdpa = model_class.from_pretrained(tmpdirname)
|
||||
model_sdpa = model_sdpa.eval().to(torch_device)
|
||||
|
||||
# Load model with eager attention
|
||||
model_eager = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
attn_implementation="eager",
|
||||
)
|
||||
model_eager = model_eager.eval().to(torch_device)
|
||||
|
||||
# SigLip has one shared cls attr for all models, so we assign both submodels heer
|
||||
vision_attn = language_attn = "sdpa" if model._supports_sdpa else "eager"
|
||||
|
||||
if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "language_model"):
|
||||
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
|
||||
self.assertTrue(model_sdpa.language_model.config._attn_implementation == language_attn)
|
||||
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
|
||||
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
|
||||
|
||||
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
|
||||
self.assertTrue(model_eager.config._attn_implementation == "eager")
|
||||
|
||||
for name, submodule in model_eager.named_modules():
|
||||
class_name = submodule.__class__.__name__
|
||||
if any(re.finditer(r"Attention(?!Pool)", class_name)):
|
||||
self.assertTrue(submodule.config._attn_implementation == "eager")
|
||||
|
||||
for name, submodule in model_sdpa.named_modules():
|
||||
class_name = submodule.__class__.__name__
|
||||
if any(re.finditer(r"Attention(?!Pool)", class_name)):
|
||||
self.assertTrue(submodule.config._attn_implementation == "sdpa")
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
class DeepseekVLIntegrationTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.model_id = "deepseek-community/deepseek-vl-1.3b-chat"
|
||||
|
||||
def test_model_text_generation(self):
|
||||
model = DeepseekVLForConditionalGeneration.from_pretrained(self.model_id, dtype="auto", device_map="auto")
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
processor = AutoProcessor.from_pretrained(self.model_id)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
|
||||
},
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
],
|
||||
}
|
||||
]
|
||||
EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard' # fmt: skip
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
||||
)
|
||||
inputs = inputs.to(model.device, dtype=model.dtype)
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
text = processor.decode(output[0], skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(
|
||||
text,
|
||||
EXPECTED_TEXT,
|
||||
)
|
||||
|
||||
def test_model_text_generation_batched(self):
|
||||
model = DeepseekVLForConditionalGeneration.from_pretrained(self.model_id, dtype="auto", device_map="auto")
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
processor = AutoProcessor.from_pretrained(self.model_id)
|
||||
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
|
||||
},
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
|
||||
},
|
||||
{"type": "text", "text": "What animal do you see in the image?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
]
|
||||
EXPECTED_TEXT = [
|
||||
"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard", # fmt: skip
|
||||
"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a bear in the image.What is the significance of the color red in the", # fmt: skip
|
||||
]
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=True, padding=True, return_dict=True, return_tensors="pt"
|
||||
)
|
||||
inputs = inputs.to(model.device, dtype=model.dtype)
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
text = processor.batch_decode(output, skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(EXPECTED_TEXT, text)
|
||||
|
||||
def test_model_text_generation_with_multi_image(self):
|
||||
model = DeepseekVLForConditionalGeneration.from_pretrained(self.model_id, dtype="auto", device_map="auto")
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
processor = AutoProcessor.from_pretrained(self.model_id)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What's the difference between"},
|
||||
{"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
|
||||
{"type": "text", "text": " and "},
|
||||
{
|
||||
"type": "image",
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image is a photograph featuring two cats lying on a pink blanket. The cat on the left is" # fmt: skip
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
||||
)
|
||||
inputs = inputs.to(model.device, dtype=model.dtype)
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
text = processor.decode(output[0], skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(
|
||||
text,
|
||||
EXPECTED_TEXT,
|
||||
)
|
||||
@@ -0,0 +1,58 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import DeepseekVLProcessor, LlamaTokenizer
|
||||
from transformers.testing_utils import get_tests_dir
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import DeepseekVLImageProcessor
|
||||
|
||||
|
||||
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
|
||||
|
||||
|
||||
class DeepseekVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = DeepseekVLProcessor
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
image_processor = DeepseekVLImageProcessor()
|
||||
tokenizer = LlamaTokenizer(
|
||||
vocab_file=SAMPLE_VOCAB,
|
||||
extra_special_tokens={
|
||||
"pad_token": "<|end▁of▁sentence|>",
|
||||
"image_token": "<image_placeholder>",
|
||||
},
|
||||
)
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
processor = self.processor_class(
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
"chat_template": "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}{% set i = 0 %}You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\n{% for message in messages %}{% if message['role']|lower == 'user' %}User: {% elif message['role']|lower == 'assistant' %}Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}{% else %}{{ message['role'].capitalize() }}: {% endif %}{% for content in message['content'] %}{% if content['type'] == 'image' %}<image_placeholder>{% elif content['type'] == 'text' %}{% set text = content['text'] %}{% if loop.first %}{% set text = text.lstrip() %}{% endif %}{% if loop.last %}{% set text = text.rstrip() %}{% endif %}{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}{{ ' ' + text }}{% else %}{{ text }}{% endif %}{% endif %}{% endfor %}{% if not loop.last or add_generation_prompt %}{% if message['role']|lower == 'user' %}{{ seps[0] }}{% else %}{{ seps[1] }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}Assistant:{% endif %}",
|
||||
"num_image_tokens": 576,
|
||||
} # fmt: skip
|
||||
Reference in New Issue
Block a user